Skip to content

Commit

Permalink
Merge pull request #17 from Parexel/tests-for-get-unique-values
Browse files (browse the repository at this point in the history)
Tests for get unique values
  • Loading branch information
CimimUxMaio authored Nov 30, 2022
2 parents 562edf1 + 2091804 commit 6b12814
Show file tree
Hide file tree
Showing 6 changed files with 296 additions and 80 deletions.
4 changes: 3 additions & 1 deletion Pipfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,12 +5,14 @@ name = "pypi"

[packages]
ijson = "*"
pytest = "*"

[dev-packages]
autopep8 = "*"
flake8 = "*"
pytest = "*"
psutil = "*"
black = "*"

[requires]
python_version = "3.10"
python_version = "3.10.7"
159 changes: 132 additions & 27 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/simple_dataset.json
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,4 +62,4 @@
}
}
}
}
}
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = stream-dataset-json
version = 0.1.4
version = 0.1.5
description = Package to stream-read large dataset JSON files.
long_description_content_type = text/markdown
url = https://github.com/Parexel/stream-dataset-json
Expand Down
61 changes: 33 additions & 28 deletions streamdatasetjson/dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,15 +5,15 @@
import ijson


Row = NewType("Row", 'list[str]')
Row = NewType("Row", "list[str]")
JSONFileObject = NewType("JSONFileObject", TextIOWrapper)


class Item(NamedTuple):
oid: str
name: str
label: str
type: str
oid: str
name: str
label: str
type: str
length: Optional[int]


Expand All @@ -30,7 +30,6 @@ def __init__(self, dataset_json_file: JSONFileObject, name: str, prefix: str):
self._records = None
self._label = None
self._items = []
self._uniques={}

self._df.seek(0)
item, item_key = None, None
Expand All @@ -56,29 +55,35 @@ def __init__(self, dataset_json_file: JSONFileObject, name: str, prefix: str):
item_key = None

def _raw_to_item(self, raw_item: dict) -> Item:
return Item(oid=raw_item["OID"],
name=raw_item["name"],
label=raw_item["label"],
type=raw_item["type"],
length=raw_item.get("length", None))

def getUniqueValues(self, variable_names: list[str], rows_to_scan: int = 0) -> dict[str, list[str]]:
unique={}

for colname in self._items:
unique[colname]=set([])

for record in self._records:
if rows_to_scan != 0 and scanned_rows>=rows_to_scan:
return Item(
oid=raw_item["OID"],
name=raw_item["name"],
label=raw_item["label"],
type=raw_item["type"],
length=raw_item.get("length", None),
)

def get_unique_values(
self, variables: "list[str]", rows_to_scan: int = 0
) -> "dict[str, set]":
uniques = {name: set() for name in variables}

scanned_rows = 0
for row in self.observations:
if rows_to_scan != 0 and scanned_rows >= rows_to_scan:
break
scanned_rows=scanned_rows+1
for variable,value in zip(self._items,record):
unique[variable].add(value)

for columns in self._items:
unique[colname]=list(unique[colname])
target_item_values = [
(item.name, value)
for item, value in zip(self.items, row)
if item.name in variables
]
for variable, value in target_item_values:
uniques[variable].add(value)

scanned_rows = scanned_rows + 1

return unique
return uniques

@property
def name(self) -> str:
Expand All @@ -97,9 +102,9 @@ def label(self) -> str:
return self._label

@property
def items(self) -> 'list[Item]':
def items(self) -> "list[Item]":
return self._items

@property
def variables(self) -> 'list[str]':
def variables(self) -> "list[str]":
return [meta.name for meta in self.items]
Loading

0 comments on commit 6b12814

Please sign in to comment.