Skip to content

Commit

Permalink
🐛 Source File: add data presence validation for HTTPS for `check_co…
Browse files Browse the repository at this point in the history
…nnection` (airbytehq#17504)
  • Loading branch information
bazarnov authored and jhammarstedt committed Oct 31, 2022
1 parent c69138b commit c26ff15
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@
- name: File
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerRepository: airbyte/source-file
dockerImageTag: 0.2.23
dockerImageTag: 0.2.24
documentationUrl: https://docs.airbyte.io/integrations/sources/file
icon: file.svg
sourceType: file
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3092,7 +3092,7 @@
supportsNormalization: false
supportsDBT: false
supported_destination_sync_modes: []
- dockerImage: "airbyte/source-file:0.2.23"
- dockerImage: "airbyte/source-file:0.2.24"
spec:
documentationUrl: "https://docs.airbyte.io/integrations/sources/file"
connectionSpecification:
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ COPY source_file ./source_file
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.2.23
LABEL io.airbyte.version=0.2.24
LABEL io.airbyte.name=airbyte/source-file
8 changes: 4 additions & 4 deletions airbyte-integrations/connectors/source-file/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,10 @@ and place them into `secrets/config.json`.

### Locally running the connector
```
python main_dev.py spec
python main_dev.py check --config secrets/config.json
python main_dev.py discover --config secrets/config.json
python main_dev.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
python main.py spec
python main.py check --config secrets/config.json
python main.py discover --config secrets/config.json
python main.py read --config secrets/config.json --catalog sample_files/configured_catalog.json
```

### Unit Tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Type,
)
from airbyte_cdk.sources import Source
from pandas.errors import ParserError

from .client import Client

Expand Down Expand Up @@ -84,8 +85,21 @@ def check(self, logger, config: Mapping) -> AirbyteConnectionStatus:
client = self._get_client(config)
logger.info(f"Checking access to {client.reader.full_url}...")
try:
with client.reader.open():
with client.reader.open() as f:
if config.get("provider").get("storage") == "HTTPS":
# on behalf of https://github.com/airbytehq/alpha-beta-issues/issues/224
# some providers, like Dropbox, create Shared Public URLs with the ?dl=0 query param,
# which requires user interaction before the file can be accessed,
# so we validate this at the Check Connection stage to avoid sync issues later on.
client.CSV_CHUNK_SIZE = 2
next(client.load_dataframes(f))
return AirbyteConnectionStatus(status=Status.SUCCEEDED)
# for all other formats and storage providers
return AirbyteConnectionStatus(status=Status.SUCCEEDED)
except ParserError:
reason = f"Failed to load {client.reader.full_url}, check the URL is valid and allows to download file directly"
logger.error(reason)
return AirbyteConnectionStatus(status=Status.FAILED, message=reason)
except Exception as err:
reason = f"Failed to load {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
logger.error(reason)
Expand Down
13 changes: 13 additions & 0 deletions airbyte-integrations/connectors/source-file/unit_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,19 @@ def invalid_config(read_file):
}


@pytest.fixture
def non_direct_url_provided_config():
    """Return a source config whose HTTPS URL is a Dropbox share link ending in ``?dl=0``.

    The config describes a CSV dataset named ``test`` served over the
    ``HTTPS`` storage provider with ``user_agent`` disabled.
    """
    # Nested provider settings are built first, then embedded in the config.
    https_provider = {
        "storage": "HTTPS",
        "user_agent": False,
    }
    config = {
        "dataset_name": "test",
        "format": "csv",
        "url": "https://www.dropbox.com/s/tcxj6fzwuwyfusq/CSV_Test.csv?dl=0",
        "provider": https_provider,
    }
    return config


@pytest.fixture
def client():
return Client(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@ def test_check_invalid_config(source, invalid_config):
assert actual.status == expected.status


def test_check_non_direct_url_provided_config(source, non_direct_url_provided_config):
    """Verify that ``source.check`` reports ``Status.FAILED`` for the non-direct URL config."""
    # Build the expected status object first, mirroring the sibling check tests.
    expected_status = AirbyteConnectionStatus(status=Status.FAILED).status
    check_result = source.check(logger=logger, config=non_direct_url_provided_config)
    assert check_result.status == expected_status


def test_discover(source, config, client):
catalog = source.discover(logger=logger, config=config)
catalog = AirbyteMessage(type=Type.CATALOG, catalog=catalog).dict(exclude_unset=True)
Expand Down
1 change: 1 addition & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
| ------- | ---------- | -------------------------------------------------------- | -------------------------------------------------------- |
| 0.2.24 | 2022-10-03 | [17504](https://github.com/airbytehq/airbyte/pull/17504) | Validate data for `HTTPS` during `check_connection` |
| 0.2.23 | 2022-09-28 | [17304](https://github.com/airbytehq/airbyte/pull/17304) | Migrate to per-stream state. |
| 0.2.22 | 2022-09-15 | [16772](https://github.com/airbytehq/airbyte/pull/16772) | Fix schema generation for JSON files containing arrays |
| 0.2.21 | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files |
Expand Down

0 comments on commit c26ff15

Please sign in to comment.