From 2d71749b27d129d6f42a04247678ae875ed76cae Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Wed, 10 Aug 2022 13:51:33 +0300 Subject: [PATCH 1/6] Specify engine for excel files --- .../connectors/source-file/source_file/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index 99e6f31877ba..cc6af16d84c7 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -319,6 +319,9 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: reader_options["nrows"] = 0 reader_options["index_col"] = 0 + yield from reader(fp, **reader_options) + elif self._reader_options == "excel": + reader_options["engine"] = "pyxlsb" yield from reader(fp, **reader_options) else: yield reader(fp, **reader_options) From 304ab63d1b92dfecc17ebc7bd3e2129cb6c9b2e8 Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Fri, 26 Aug 2022 14:31:37 +0300 Subject: [PATCH 2/6] Add test for the chagne related to pyxlsb --- .../integration_tests/sample_files/test.xlsb | Bin 0 -> 7976 bytes .../source-file/source_file/client.py | 5 ++-- .../source-file/unit_tests/conftest.py | 1 + .../source-file/unit_tests/test_client.py | 25 ++++++++++-------- 4 files changed, 18 insertions(+), 13 deletions(-) create mode 100644 airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test.xlsb new file mode 100644 index 0000000000000000000000000000000000000000..f0f3082f8adbbdfb0e34dee5e2324eebbb107dcd GIT binary patch literal 7976 zcmeHMgL^t4JDq{^S~KZdFyT@R*@$TK1-)afo=gvWoQQOQUy1Wf^;v{$vM(d8gQ1w zp!w>pgqdWjQ8l%t_T# zGM$+{l_66o<;nvDu?HT2Va0nMj;qQT4oL`RwLRaFvrBUO|Ax^#u3az$Fb`ZBqYZg@cMBR`Ax5ed~mcf z@b%(bL7mGFBSlvqO*qybR|bp|lE|Z|yUmZ>JO#fwIn#EOI(LIkEcfKFpGaR1-I791 z7m}_YhpgwWJVH&6`sKg`;QY1J&Pv9$F+2EB<6#9+|+9UDNE^F!nwRh#?{eJ$}RsX^M`ODN}lZVwh_zAj@O3kvXC!3B+ zxKHbv7>gE&xrmln9ck*~3a?3_j;z>A1Ygm8L+%gfH7^W~m%I^?<9-?Jt{H6cyz}mq z?#^iQ%Ur2WxuFq`cy1AwP-Fi`_5L4|wCJN-4mF`yDY)0Q_l6@iQ8zVt5H$BsP7AdG`c5aS-@F+Nkbw-g=DTf+>Oh1Ua8-WboU3Ob2fOf;g z_aon32@mPfpe+~A42fYVdGuR9@hd)xZAYgzQValq2>pbYJ)ftOi=Bm&lO3;xjbm(* zI$xnCSy$d1XBDO9`E}+brJXnAsRcQ2!wXceMH6IJrDUXWKFhI=l9LMy4Krg;fe*>} z0U6r+((^s8;N!=9X+c(IsxCbFxac}`3nhw@c+g#9=5z3FX9*n*!@uos8H6ts=F60~ ze07iza@Fu&DpGb>Q($Kkxyefp zvypO@*)^1i9E4?JL=}j-9nT^n61%d{Pq0W;xwq3Wu6|)G_u;#x_k>~C?-1>J9wXF} z66Ix8xx8ajX+x^xsx1dvOt<)M)5(qvNOl^5Y2mNOL?jrFPfX z5WxBH{&3|%6lx3Kly`Q)s`RDYCYc}=c%*N_cV^|Q@{4!oFfd%A{2{9#;o_HUNS27| zc}R(5ctumb#W~x_m!qb?UGB}p{q$q>oknz6qz2#u(aZfKGXA>g z|A~@7G@(J8#y@+MYr(%qi58Y!q9ml?Cn?;g#Z4+oO)_5s6B|_Pgi0-HMqAmrg5Adg z@9?GNf?H90%ZDEdkq9759h{q^nikWC)P9Qw3pP1Eh=FVIsp506A{NUP`@XHRLjjMh z@hh)A7?bYa-sf3J*nO|&zZ{qRy>Dfzv= z4Pm0rWcqlf&%I20z1`|DtAd z8v+s9ZYa^$Kf{EpH5BUR%J;_w@;zSIT3KZba0DcWMU4gl3C?7Jkpf|w069nqE~6Rd zvYtY?n!;=GGB`-19jNr2@zQ{q4$#INXpu-x3>amUP$57dEv&39t-SgMI@$)?+TdnH zdg3HX${f&lamj7Ua1 zOJ}ph&@epopLfi5E6y2+9{!xpOI9=pS$=j|xyqDvTdC8g0TmE|X9nR|+fwGO*zk;< zc6tSx5IEGAvZ4=r0^Bw-!pKBiZ#KpN!Mt(%RUl9rrXU>AO$-opHb^h~EU^ z003Q#-;0DlRODt2b%65yafW;^61MLR#?UB|xALt%vi{1uRc6Q#0xeWm-=>>OI;d2=ew@HwsMU`OmFp$YOk0X? z?P_oJ-%4qnm&3W<4mpyNAi*hFQPkok>VrbU%&$3CvZ^gLiLOABcls#Ss1!fu3qM;k zC?4bJj=c6JJ)d$yB4fCh%$KxythG*4qG%^e=_&TJ)P%>j_P4FKw`*SzTY$f&`aX#C z@XL76HN<|Ld2lN_G?Y&)K>qy8nxSz2IwNU;OF}u5@YthFBQ}2ViVe{(%h&P*aoh7p z2M_H4kDD6IU8gC9T>Pm6pK0H<>FazuQ+^miU>7*181Y%!RRd<=CHu%f!c|y@0Ust^ zdS-?rNqq>9CN~Mu@@oho^8K{f_7*11`KBtwNv|;^-)3VfKQejM4afKr&EeVoI#H{k z^{1qZmqRpb(m44vhzY&jN;yC9B%N4*71XI1fKout5}9z zgX-ljk@6dr>MA@IJJrvVlC()Ez=$QV>BP^7%wg*6Ce>C;t6q7BRq+Psee(72NxEc< z_m}-A`mn|3Q+=U&J!mgoft7jRx7~$z=ide?7Lk$xC*A4iO3xR&@JY`<=RNXA^}~?b zMBKb8yGLaVi{B19=3wVXb0|}>T2E0i$a~T&mhTt3KQ6M6cYx?lcQ3VOBd*CHlBG!2 zJxb{NdO(Y&ESv3zl@9IfwAHcrUJni@mc`Vwd`Ue(Iv zpL@UbA#Ta658Uu%U`-2&C}SmB&&a}-l)3JbL<{>efglXSROjP|E;@&YIrrXNsx;}?KrwbSECi@bRmH6)XE(ah zGj*w~2Id^GkntH$LvAyZo|KA&5z8;ivKjAK-bWH2UJj7zHSv^Wnsz}&kispMUa}Q4 zOHBxKf8JT68z?`fgKf~7>!)q(P4~StR?8k8H3XNBNVI1ZxC(Ym`<|ADNGE{7Pbi9z z3W>UxtGFvaGmGm@`UYdw-hKg_QB6{myUQfrh5ssIc-Mz}w_NiI%NzB5|D8sgeI^~z z2VDMA&!4mHNCX`bEqB_a;(**J;Vt@G9k}=E?zG^0Hottm)*7=7nNC*+Q&UW*GluYi zF`^byyKmq9yvxYHn3uwSf1kp)e$aY_tH1fQ0cJ8lVgfepK};^~KNKlW=4>40QA4^MW_^;$t9HcYnCVaCXREnAL3b_d zi8S<@|1*3A*TfO-6Aqr)%jDqVfRz_?EW|jHyx7r+Xva6f(vPGlE8`H^N3BZ%Uc`0 zo%+;XOnwWBcZ(rZPr7i%haslyEA{FtR}9qw6=_mr^NR2$Q|JI0Sp1AuE%rUcL}nZJ zund0`%cx0(RbBX0_j-RazNi$lGU3Yf&<(uldZU&Q6gD6zU*$D3{1FHmG^Y9 zvq^vyIFl1u0w7^?Z)Wagr!s!1%~W&r7$h~Gr7>1RPjkR3x80pmw)FZNtu6|L1<%)0xv!}m>p5Q8H#1XWEG~q93aE=AyTr03 zEM!~1oi>I0(T(4$xR1zXP7_~tU)8h}Tmc&xG<5LTBTMcEG|pdrEYoyl$U`(Z+`8O1 zSfRpERWZ&SK>0ji08D*Kw;bcyt=*iM6ED|0u9khv*T*ULp{hov^ZIrte4lTDZh!PH6tbO~Yf&gQ zM#d|{^QXA#PU1{#M%IOK3Z*6O>lFF?kSZY|jx7v=;T2kiv&i&d1^H2@x;umUdRy3o zj&_u8rdAFF{MLJCi9&Zc&z% zCJt-erW>dmRb?`mImS^IR3O&ICBYEF3pxfV$YVqk>s0o$+}*G{-WQquf{{g5kk6r< z{W_hPhb$N$wY@zc8Zk5DyWBu>8$fEQ9^FkYrZjt5HC@NZDvpjTwUlcSPoKGu$(TRB zpPSO1X^~|cFcy|tZEW(Mfp@Ci*e|I!v!_&`fg;!PORAI5oB{H^yqERl?A6NILX8AF z7UN4s(FP7uE#2x$yV3c_o>&4n3{1cvJ(P_^R1Iqp}Op*Z)cpe zEbUhgWaZbG z18@<^_43VpilLhn5_X$!N?u35GOL&w-c9c>{y1OOU%Eo`J2oVKh?{sFty4wxLH4I^ zt<7DaR@!bZHjWSeP;u6954^9f4ba}+Ou!@6m&ugJ6agoGY=>YR7;3{!bq|GPn-M$+ z$BHFD1N??@J=J8(fTQowqSuh+$2-24F@KWMA9c-7YWhc2V?PYHMYDPArIUQ5nVV_$ z0e@Gd_OyAT-3Gl>&0~aMS{zh)@95Cs)AVlovE4UPUzPgV(K1&3bBzv+<+CeIe~s3%XDJ8|PoR_e2z6p68W*v8>lHk_xo zbjKO^2>T->5uU-{+jZsCpqcoKRv#|k^os_wur-TMGG&n+l_@6hve~CR)!PcMl$5!( zs&SqLuti@gHCOoRZ;c&@9Yy%cnzPR@sXrx+zjn6+Uv^DphFZ)k*Txo=U182Mw2895 zZ$9N7^V!xqTbG}@N-KUak|P+SWTI3*EnMNIxTGh{H5T65AXI|9%<}#n zZ9nKuW&l_L&N;?UDJm2zW^UIv+GDx4vAI?Msih>koMX&BlU#W2CU>h*`n}zzyw&9Z zr^yWK=ecKj4Lq=Yfn^d6!nsmhPhORMIPI?063_03n~4SAR+l15_YYs5p+oN9&6(ke zaSRf@TG42eCjQl&&7Gb9wPv(p{v6qfs_N({7LHUpjo?7tpi&~u8dF)hW5gyByHGgs ziP=D0xqafymw*)O;p+X9I)>I)GqJLKYKbn40f+VkFQBSA&ijPdm z8s-Wc zoACq^3dV?FtYtPxJU*A9*~>fIGnbSXvL6)clJL?$R&K(V(0ukPGGbJgFSm*aSK-@d zh5rJAM~&F7|wWB7yM#`w@TZ09{16*cSPT z!irAo=&!lhB)JH1k){0vm_oN%&;UO<+eOifB;=>4z||k37b(d_go_m8CqnuUg#V9l zTm<}U#{LNh0A!K_0DsHh7sdY?A%7LOrus$v&vz5_)_aJS00906Q-o0M literal 0 HcmV?d00001 diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index cc6af16d84c7..ddb857403903 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -228,7 +228,7 @@ class Client: CSV_CHUNK_SIZE = 10_000 reader_class = URLFile - binary_formats = {"excel", "feather", "parquet", "orc", "pickle"} + binary_formats = {"excel", "excel_binary", "feather", "parquet", "orc", "pickle"} def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None): self._dataset_name = dataset_name @@ -299,6 +299,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: "flat_json": pd.read_json, "html": pd.read_html, "excel": pd.read_excel, + "excel_binary": pd.read_excel, "feather": pd.read_feather, "parquet": pd.read_parquet, "orc": pd.read_orc, @@ -320,7 +321,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: reader_options["index_col"] = 0 yield from reader(fp, **reader_options) - elif self._reader_options == "excel": + elif self._reader_options == "excel_binary": reader_options["engine"] = "pyxlsb" yield from reader(fp, **reader_options) else: diff --git a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py index 6df81f0fc4ed..76eb0981820f 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py @@ -3,6 +3,7 @@ # from pathlib import Path + import pytest from source_file.client import Client diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index 81463116fc2a..ee2d849a75a0 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -2,18 +2,11 @@ # Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -import pytest -from pandas import read_csv -from source_file.client import Client, URLFile, ConfigurationError - +from calendar import c -@pytest.fixture -def client(): - return Client( - dataset_name="test_dataset", - url="scp://test_dataset", - provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}}, - ) +import pytest +from pandas import read_csv, read_excel +from source_file.client import Client, ConfigurationError, URLFile @pytest.fixture @@ -58,6 +51,16 @@ def test_load_dataframes(client, wrong_format_client, absolute_path, test_files) next(client.load_dataframes(fp=f, skip_data=True)) +def test_load_dataframes_xlsb(config, absolute_path, test_files): + config["format"] = "excel_binary" + client = Client(**config) + f = f"{absolute_path}/{test_files}/test.xlsb" + read_file = next(client.load_dataframes(fp=f)) + expected = read_excel(f, engine="pyxlsb") + assert read_file.equals(expected) + + + def test_load_nested_json(client, absolute_path, test_files): f = f"{absolute_path}/{test_files}/formats/json/demo.json" with open(f, mode='rb') as file: From 3c3f8e660af713a1266825aa932dfe059f4351ff Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Fri, 26 Aug 2022 14:46:32 +0300 Subject: [PATCH 3/6] Update changelog and bump connector version --- airbyte-integrations/connectors/source-file-secure/Dockerfile | 4 ++-- airbyte-integrations/connectors/source-file/Dockerfile | 2 +- docs/integrations/sources/file.md | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/source-file-secure/Dockerfile b/airbyte-integrations/connectors/source-file-secure/Dockerfile index 33c8fae0d313..5f46ec28cf82 100644 --- a/airbyte-integrations/connectors/source-file-secure/Dockerfile +++ b/airbyte-integrations/connectors/source-file-secure/Dockerfile @@ -1,4 +1,4 @@ -FROM airbyte/source-file:0.2.20 +FROM airbyte/source-file:0.2.21 WORKDIR /airbyte/integration_code COPY source_file_secure ./source_file_secure @@ -9,5 +9,5 @@ RUN pip install . ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.20 +LABEL io.airbyte.version=0.2.21 LABEL io.airbyte.name=airbyte/source-file-secure diff --git a/airbyte-integrations/connectors/source-file/Dockerfile b/airbyte-integrations/connectors/source-file/Dockerfile index a00507cb5195..3e630de72c37 100644 --- a/airbyte-integrations/connectors/source-file/Dockerfile +++ b/airbyte-integrations/connectors/source-file/Dockerfile @@ -17,5 +17,5 @@ COPY source_file ./source_file ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py" ENTRYPOINT ["python", "/airbyte/integration_code/main.py"] -LABEL io.airbyte.version=0.2.20 +LABEL io.airbyte.version=0.2.21 LABEL io.airbyte.name=airbyte/source-file diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 67dc3a9e1569..6f6558e07f51 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -86,6 +86,8 @@ In case you select `JSON` format, then options from the [read\_json](https://pan For example, you can use the `{"orient" : "records"}` to change how orientation of data is loaded (if data is `[{column -> value}, … , {column -> value}]`) +If you need to read Excel Binary Workbook, please specify `excel_binary` format in your `config.json` file. + #### Changing data types of source columns Normally, Airbyte tries to infer the data type from the source, but you can use `reader_options` to force specific data types. If you input `{"dtype":"string"}`, all columns will be forced to be parsed as strings. If you only want a specific column to be parsed as a string, simply use `{"dtype" : {"column name": "string"}}`. @@ -127,6 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------|---------------------------------------------------| +| 0.2.21 | 2022-08-26 | [0000](https://github.com/airbytehq/airbyte/pull/0000) | Specify `pyxlsb` library for Excel Binary Workbook files | 0.2.20 | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery | | 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' | | 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover | From 0a90a8e98b246a34ed7fbaedc8caedf74ab1ebeb Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Fri, 26 Aug 2022 14:47:48 +0300 Subject: [PATCH 4/6] Update PR link and number --- docs/integrations/sources/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 6f6558e07f51..b25d63003dad 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -129,7 +129,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |---------|------------|----------------------------------------------------------|---------------------------------------------------| -| 0.2.21 | 2022-08-26 | [0000](https://github.com/airbytehq/airbyte/pull/0000) | Specify `pyxlsb` library for Excel Binary Workbook files +| 0.2.21 | 2022-08-26 | [15568](https://github.com/airbytehq/airbyte/pull/15568) | Specify `pyxlsb` library for Excel Binary Workbook files | 0.2.20 | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery | | 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' | | 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover | From 997bba7badad64e8af7996e817f071f303b08866 Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Fri, 26 Aug 2022 15:13:12 +0300 Subject: [PATCH 5/6] Update spec to include 'excel_binary' option, formatted files --- .../connectors/source-file/source_file/spec.json | 2 +- .../connectors/source-file/unit_tests/conftest.py | 9 ++++++++- .../connectors/source-file/unit_tests/test_client.py | 8 +++----- .../connectors/source-file/unit_tests/test_source.py | 1 - docs/integrations/sources/file.md | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/airbyte-integrations/connectors/source-file/source_file/spec.json b/airbyte-integrations/connectors/source-file/source_file/spec.json index 161a1219e15a..9af744dc32f6 100644 --- a/airbyte-integrations/connectors/source-file/source_file/spec.json +++ b/airbyte-integrations/connectors/source-file/source_file/spec.json @@ -15,7 +15,7 @@ }, "format": { "type": "string", - "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"], + "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"], "default": "csv", "title": "File Format", "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)." diff --git a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py index 76eb0981820f..6846e5f6fa4a 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/conftest.py @@ -14,6 +14,7 @@ def _read_file(file_name): parent_location = Path(__file__).absolute().parent file = open(parent_location / file_name).read() return file + return _read_file @@ -24,7 +25,13 @@ def config(): @pytest.fixture def invalid_config(read_file): - return {"dataset_name": "test", "format": "jsonl", "url": "https://airbyte.com", "reader_options":'{"encoding": "encoding"}', "provider": {"storage": "HTTPS"}} + return { + "dataset_name": "test", + "format": "jsonl", + "url": "https://airbyte.com", + "reader_options": '{"encoding": "encoding"}', + "provider": {"storage": "HTTPS"}, + } @pytest.fixture diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index ee2d849a75a0..d62c2b005760 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -2,7 +2,6 @@ # Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -from calendar import c import pytest from pandas import read_csv, read_excel @@ -15,7 +14,7 @@ def wrong_format_client(): dataset_name="test_dataset", url="scp://test_dataset", provider={"provider": {"storage": "HTTPS", "reader_impl": "gcsfs", "user_agent": False}}, - format="wrong" + format="wrong", ) @@ -60,10 +59,9 @@ def test_load_dataframes_xlsb(config, absolute_path, test_files): assert read_file.equals(expected) - def test_load_nested_json(client, absolute_path, test_files): f = f"{absolute_path}/{test_files}/formats/json/demo.json" - with open(f, mode='rb') as file: + with open(f, mode="rb") as file: assert client.load_nested_json(fp=file) @@ -83,7 +81,7 @@ def test_dtype_to_json_type(client, current_type, dtype, expected): def test_cache_stream(client, absolute_path, test_files): f = f"{absolute_path}/{test_files}/test.csv" - with open(f, mode='rb') as file: + with open(f, mode="rb") as file: assert client._cache_stream(file) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py index 0ed715e30503..5c29acddc287 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_source.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_source.py @@ -20,7 +20,6 @@ SyncMode, Type, ) - from source_file.source import SourceFile logger = logging.getLogger("airbyte") diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index b25d63003dad..a222e78c4224 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -86,7 +86,7 @@ In case you select `JSON` format, then options from the [read\_json](https://pan For example, you can use the `{"orient" : "records"}` to change how orientation of data is loaded (if data is `[{column -> value}, … , {column -> value}]`) -If you need to read Excel Binary Workbook, please specify `excel_binary` format in your `config.json` file. +If you need to read Excel Binary Workbook, please specify `excel_binary` format in `File Format` select. #### Changing data types of source columns From 7b95f5f4527d048a6dde5b1eee8231fbdeb825ad Mon Sep 17 00:00:00 2001 From: Arsen Losenko Date: Fri, 26 Aug 2022 16:28:41 +0300 Subject: [PATCH 6/6] Update spec for source-file-secure to match changes made in source-file spec --- .../connectors/source-file-secure/integration_tests/spec.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json b/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json index cc4872eb81dd..9dc5be7a95e3 100644 --- a/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json +++ b/airbyte-integrations/connectors/source-file-secure/integration_tests/spec.json @@ -14,7 +14,7 @@ }, "format": { "type": "string", - "enum": ["csv", "json", "jsonl", "excel", "feather", "parquet", "yaml"], + "enum": ["csv", "json", "jsonl", "excel", "excel_binary", "feather", "parquet", "yaml"], "default": "csv", "title": "File Format", "description": "The Format of the file which should be replicated (Warning: some formats may be experimental, please refer to the docs)."