Skip to content

Commit

Permalink
Update to frictionless >=5.16
Browse files (browse the repository at this point in the history)
  • Loading branch information
amotl committed Oct 20, 2024
1 parent 1688721 commit d16a663
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 90 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ in progress
- Added support for Python 3.12 and 3.13
- Adjusted SQL DDL for sqlalchemy-cratedb 0.40.0
- Adjusted ddlgenerator wrapper for pandas 2
- Updated to frictionless >=5.16


2023-03-09 0.1.0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ dependencies = [
"colorama<1",
"crash",
"ddlgenerator<0.2",
"frictionless[excel,json,ods,parquet,sql]<5.19",
"frictionless[excel,json,ods,parquet,sql]>=5.16,<5.19",
"fsspec[gcs,github,http,s3]==2024.9.0",
"json_stream<3",
"line-protocol-parser<2",
Expand Down
28 changes: 17 additions & 11 deletions skeem/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,15 @@ def _ddl_frictionless(self) -> SqlResult:

warnings.filterwarnings("ignore", category=GuessedAtParserWarning)

import frictionless.formats
import sqlalchemy as sa
from ddlgenerator.ddlgenerator import _dump
from frictionless.formats import ExcelControl, OdsControl
from frictionless import Control, Schema
from frictionless.formats import ExcelControl, OdsControl, SqlMapper

from skeem.ddlgen.ddlgenerator import TablePlus
from skeem.frictionless.resource import TableSampleResource

# Sanity checks.
if not self.target.dialect:
raise ValueError("Inferring the database schema needs an SQLAlchemy dialect")

frictionless_args: t.Dict[str, t.Union[str, t.IO]] = {}
if self.resource.path is not None:
Expand All @@ -103,15 +106,15 @@ def _ddl_frictionless(self) -> SqlResult:
raise ValueError("Unable to read any data")

# Define resource controls.
control = None
control: t.Union[Control, None] = None
if self.resource.type is ContentType.ODS:
control = OdsControl(sheet=self.resource.address or 1)
elif self.resource.type is ContentType.XLSX:
control = ExcelControl(sheet=self.resource.address or 1)

# Open resource.
logger.info(f"Opening resource {frictionless_args} with {control}")
resource = frictionless.Resource(**frictionless_args, control=control)
logger.info(f"Opening resource {frictionless_args}. type={self.resource.type}, control={control}")
resource = TableSampleResource(**frictionless_args, control=control) # type: ignore[arg-type]

# When primary key is not given, try to infer it from the data.
# TODO: Make `infer_pk` obtain a `Resource` instance, and/or refactor as method.
Expand All @@ -126,15 +129,14 @@ def _ddl_frictionless(self) -> SqlResult:

# Infer schema.
logger.info("Inferring schema")
engine = sa.create_mock_engine(sa.engine.make_url(f"{self.target.dialect}://"), executor=_dump)
mapper = frictionless.formats.sql.SqlMapper(engine)
mapper = SqlMapper(dialect=self.target.dialect)
descriptor = resource.to_descriptor()

# Either `schema` is already present, or it needs to be established by invoking `describe` first.
if "schema" in descriptor:
schema = frictionless.Schema.from_descriptor(descriptor["schema"])
schema = Schema.from_descriptor(descriptor["schema"])
else:
schema = frictionless.Schema.describe(**frictionless_args, control=control)
schema = Schema.describe(**frictionless_args, control=control)

logger.debug(f"Inferred schema: {schema}")

Expand All @@ -144,6 +146,10 @@ def _ddl_frictionless(self) -> SqlResult:
pk_field.required = True
schema.primary_key = [self.target.primary_key]

# Sanity checks.
if not self.target.table_name:
raise ValueError("Table name must not be empty")

# Create SQLAlchemy table from schema.
logger.info("Converging schema to SQLAlchemy")
table = mapper.write_schema(schema, table_name=self.target.table_name, with_metadata=False)
Expand Down
18 changes: 1 addition & 17 deletions skeem/frictionless/monkey.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,12 @@
from .loader_stream import read_byte_stream_create
from .pandas_plugin import create_parser
from .parser_jsonl import read_cell_stream_create
from .parser_xlsx import read_loader
from .resource import ResourcePlus


def activate():
patch_core()
patch_modules()


def patch_core():
"""
Override sample size for all `frictionless.Resource` instances.
"""
import frictionless

frictionless.resource.Resource = ResourcePlus
frictionless.Resource = ResourcePlus


def patch_modules():
"""
Enhance `frictionless` loader and parser modules.
Expand All @@ -30,13 +17,10 @@ def patch_modules():
"""

import frictionless.formats.excel.parsers
import frictionless.formats.json.parsers
import frictionless.formats.pandas.plugin
import frictionless.schemes.aws.loaders.s3
import frictionless.schemes.stream.loader

frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create = read_cell_stream_create
frictionless.formats.JsonlParser.read_cell_stream_create = read_cell_stream_create
frictionless.formats.pandas.plugin.PandasPlugin.create_parser = create_parser
frictionless.schemes.aws.loaders.s3.S3Loader.read_byte_stream_create = s3_read_byte_stream_create
frictionless.schemes.stream.loader.StreamLoader.read_byte_stream_create = read_byte_stream_create
frictionless.formats.excel.parsers.XlsxParser.read_loader = read_loader
4 changes: 2 additions & 2 deletions skeem/frictionless/parser_jsonl.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import frictionless.formats
from frictionless.formats import JsonlParser

read_cell_stream_create_original = frictionless.formats.json.parsers.JsonlParser.read_cell_stream_create
read_cell_stream_create_original = JsonlParser.read_cell_stream_create


def read_cell_stream_create(self):
Expand Down
52 changes: 0 additions & 52 deletions skeem/frictionless/parser_xlsx.py

This file was deleted.

14 changes: 7 additions & 7 deletions skeem/frictionless/resource.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from frictionless import Resource
from frictionless.resources import TableResource

from skeem.settings import PEEK_LINES


class ResourcePlus(Resource):
class TableSampleResource(TableResource):
"""
Override sample size for frictionless `Resource` instances.
"""

def __init__(self, *args, **kwargs):
from frictionless import Detector
def __attrs_post_init__(self):
if self.detector is None:
from frictionless import Detector

if "detector" not in kwargs:
kwargs["detector"] = Detector(sample_size=PEEK_LINES)
super().__init__(*args, **kwargs)
self.detector = Detector(sample_size=PEEK_LINES)
super().__attrs_post_init__()

0 comments on commit d16a663

Please sign in to comment.