-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrate indicators indexing and guide (#15)
* Use the schema for context generation Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com> * Update guide and add wbdocs metadata migrate script Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com> * Add nada_wdi scraper and create secrets dir Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com> * Add generation of text for the wdi Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com> * Implement indicator indexer scripts Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com> --------- Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>
- Loading branch information
1 parent
2641e6e
commit 38b8dca
Showing
9 changed files
with
319 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
secrets/ | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import json | ||
from typing import Union, Optional | ||
from pathlib import Path | ||
import fire | ||
from tqdm.auto import tqdm | ||
from llm4data.schema.docs import wbdocs | ||
|
||
|
||
def main(source_dir: Union[str, Path], target_dir: Optional[Union[str, Path]] = None):
    """Convert WBDocs metadata files into their schema representation.

    Every ``D*.metadata.json`` file found directly in ``source_dir`` is
    parsed, passed through ``wbdocs.WBDocsToSchema(...).schema()`` and
    serialized back to JSON with ``None``-valued fields left out.

    Args:
        source_dir: Directory holding the original metadata files.
        target_dir: Directory receiving the converted files (same file
            names). When omitted, each source file is overwritten in place.

    Raises:
        AssertionError: If ``source_dir`` or a given ``target_dir`` does not
            exist.
    """
    source_dir = Path(source_dir)
    assert source_dir.exists(), f"{source_dir} does not exist"

    if target_dir:
        target_dir = Path(target_dir)
        assert target_dir.exists(), f"{target_dir} does not exist"

    metadata_paths = sorted(source_dir.glob("D*.metadata.json"))

    for metadata_path in tqdm(metadata_paths):
        raw_metadata = json.loads(metadata_path.read_text())
        schema = wbdocs.WBDocsToSchema(raw_metadata).schema()

        # Without a target directory the migration rewrites the source file.
        if target_dir:
            destination = target_dir / metadata_path.name
        else:
            destination = metadata_path

        destination.write_text(json.dumps(schema.dict(exclude_none=True)))
|
||
|
||
# Script entry point: python-fire turns the parameters of `main` into
# command-line flags.
if __name__ == "__main__":
    # python -m llm4data.schema.docs.migrate_wbdocs_metadata --source_dir=<source_dir> --target_dir=<target_dir>
    fire.Fire(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
import json | ||
from pathlib import Path | ||
from typing import Union, Optional | ||
from tqdm.auto import tqdm | ||
|
||
import fire | ||
from metaschema.indicators2 import IndicatorsSchema, SeriesDescription | ||
|
||
|
||
def create_wdi_text(series_description: SeriesDescription):
    """Render a WDI indicator's descriptive fields as one block of text.

    Each available field becomes a ``"<label>: <value>"`` section; sections
    are separated by a blank line and empty/missing fields are skipped.

    Args:
        series_description: The series description object. The attributes
            read are ``name``, ``definition_long``, ``definition_short``,
            ``relevance`` and ``statistical_concept``.

    Returns:
        The joined sections with surrounding whitespace stripped, or an
        empty string when no field has content.
    """
    # (label, value) pairs in output order; the long definition takes
    # precedence over the short one.
    labelled_sections = (
        ("Indicator name", series_description.name),
        (
            "Definition",
            series_description.definition_long or series_description.definition_short or "",
        ),
        ("Development relevance", series_description.relevance or ""),
        (
            "Statistical concept and methodology",
            series_description.statistical_concept or "",
        ),
    )

    joined = "\n\n".join(
        f"{label}: {value}" for label, value in labelled_sections if value
    )

    return joined.strip()
|
||
|
||
def main(metadata_dir: Union[str, Path]):
    """Generate the embedding text file for every indicator metadata file.

    Each ``*.json`` file in ``metadata_dir`` is parsed with
    ``IndicatorsSchema`` and the text built by ``create_wdi_text`` from its
    series description is written to ``<metadata_dir>/../text/<stem>.txt``.
    Indicators whose text file already exists are skipped, so the script can
    be re-run without redoing finished work.

    Args:
        metadata_dir: Directory containing the indicator metadata JSON files.

    Raises:
        AssertionError: If ``metadata_dir`` does not exist.
    """
    metadata_dir = Path(metadata_dir)
    assert metadata_dir.exists(), f"{metadata_dir} does not exist"

    # The text files live in a ``text`` directory next to the metadata
    # directory. Create it up front: writing into a missing directory would
    # otherwise fail with FileNotFoundError on the first indicator.
    text_dir = metadata_dir.parent / "text"
    text_dir.mkdir(parents=True, exist_ok=True)

    for p in tqdm(sorted(metadata_dir.glob("*.json"))):
        fname = text_dir / f"{p.stem}.txt"
        if fname.exists():
            continue

        metadata = json.loads(p.read_text())
        s = IndicatorsSchema(**metadata)
        text = create_wdi_text(s.series_description)
        fname.write_text(text)
|
||
|
||
# Script entry point: python-fire turns the parameters of `main` into
# command-line flags.
if __name__ == "__main__":
    # python -m llm4data.schema.indicators.create_wdi_text --metadata_dir=data/sources/indicators/wdi/metadata
    fire.Fire(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from typing import List, Optional, Union | ||
from langchain.text_splitter import NLTKTextSplitter | ||
from langchain.docstore.document import Document | ||
from llm4data import configs | ||
from llm4data.index import get_indicators_index | ||
|
||
|
||
# Get access to the indicators index once, at import time; it is the
# collection that `add_indicators` writes documents into.
# NOTE(review): the earlier comment called this the "Qdrant docs collection";
# the call below fetches the *indicators* index - confirm the backing store.
indicators_index = get_indicators_index()
# NLTK-based text splitter instance.
# NOTE(review): not referenced by the functions visible in this module -
# confirm whether other modules import it before removing.
text_splitter = NLTKTextSplitter()
|
||
|
||
def build_document(text: str, metadata: dict = None):
    """Wrap a text and its optional metadata in a ``Document``.

    Args:
        text: Content stored as the document's ``page_content``.
        metadata: Payload stored under ``configs.METADATA_KEY``. When it is
            ``None`` or empty, the document gets an empty metadata dict.

    Returns:
        The constructed ``Document``.
    """
    # Nest the payload under the configured key only when there is one.
    if metadata:
        document_metadata = {configs.METADATA_KEY: metadata}
    else:
        document_metadata = {}

    return Document(page_content=text, metadata=document_metadata)
|
||
|
||
def add_indicators(text: Union[str, List[str]], metadata: Optional[Union[dict, List[dict]]] = None):
    """Build documents from indicator text(s) and add them to the index.

    Args:
        text: A single indicator text, or a list of indicator texts.
        metadata: Metadata for the text(s). For a single text this is one
            dict (or ``None``). For a list of texts this is a list with one
            entry per text, or ``None`` to index every text without metadata.

    Raises:
        TypeError: If ``text`` is a list but ``metadata`` is a single dict.
        ValueError: If ``text`` and ``metadata`` are lists of different
            lengths.
    """
    if isinstance(text, str):
        documents = [build_document(text, metadata)]
    else:
        texts = list(text)

        if metadata is None:
            # Previously ``zip(text, None)`` raised TypeError here; treat a
            # missing metadata list as "no metadata for any text".
            metadatas = [None] * len(texts)
        elif isinstance(metadata, dict):
            # Zipping over a dict would pair each text with a dict *key*.
            raise TypeError("metadata must be a list of dicts when text is a list")
        else:
            metadatas = list(metadata)

        if len(texts) != len(metadatas):
            # ``zip`` would silently drop the unmatched items.
            raise ValueError(
                f"Got {len(texts)} texts but {len(metadatas)} metadata entries"
            )

        documents = [build_document(t, m) for t, m in zip(texts, metadatas)]

    # Add the documents to the collection
    indicators_index.add_documents(documents)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
"""Given a dump of the WDI indicators contained in WDIEXCEL.xlsx file, | ||
process them to generate the text for the embedding, extract the metadata | ||
for the payload, and load them into the vector index. | ||
""" | ||
from typing import Union | ||
from pathlib import Path | ||
from tqdm.auto import tqdm | ||
import fire | ||
from .indicators import add_indicators, indicators_index | ||
import json | ||
from metaschema.indicators2 import IndicatorsSchema | ||
|
||
|
||
def load_indicators(collection_dir: Path):
    """Load the indicators from the collection directory into the index.

    For every ``text/<name>.txt`` file the matching
    ``metadata/<name>.json`` is validated with ``IndicatorsSchema`` and both
    are passed to ``add_indicators``. Progress is tracked in two files inside
    ``collection_dir`` (suffixed with the index's collection name):

    * ``indexed_indicators-<collection>.txt`` - one text-file path per line
      for every indicator indexed successfully; those are skipped on re-runs.
    * ``failed_indicators-<collection>.txt`` - tab-separated path and error
      message for every indicator that raised.

    Args:
        collection_dir (Path): Path to the collection directory. It must
            contain ``text`` and ``metadata`` sub-directories.

    Raises:
        AssertionError: If the collection directory or one of its required
            sub-directories does not exist.
    """
    cname = indicators_index.collection_name

    collection_dir = Path(collection_dir)
    text_dir = collection_dir / "text"
    metadata_dir = collection_dir / "metadata"
    # Check the parent first so a missing collection directory is reported as
    # such instead of as a missing ``text`` sub-directory.
    assert collection_dir.exists(), f"{collection_dir} does not exist."
    assert text_dir.exists(), f"{text_dir} does not exist."
    assert metadata_dir.exists(), f"{metadata_dir} does not exist."

    indexed_indicators_path = collection_dir / f"indexed_indicators-{cname}.txt"
    failed_indicators_path = collection_dir / f"failed_indicators-{cname}.txt"

    print("Indexed indicators path:", indexed_indicators_path)

    indexed_indicators = set()
    if indexed_indicators_path.exists():
        print("Loading indexed indicators...")
        with open(indexed_indicators_path, "r") as f:
            indexed_indicators = {line.strip() for line in f}

    print("Indexed indicators:", len(indexed_indicators))

    for indicator_path in tqdm(sorted(text_dir.glob("*.txt"))):
        if str(indicator_path) in indexed_indicators:
            continue
        metadata_path = metadata_dir / f"{indicator_path.stem}.json"

        try:
            metadata = json.loads(metadata_path.read_text())
            text = indicator_path.read_text()

            s = IndicatorsSchema(**metadata)

            add_indicators(
                text=text,
                metadata=s.dict(exclude_none=True),
            )

        except KeyboardInterrupt:
            # Let the user abort the run; bare ``raise`` keeps the traceback.
            raise

        except Exception as e:
            # Best effort: record the failure and move on to the next file.
            with open(failed_indicators_path, "a+") as f:
                f.write(f"{indicator_path}\t{e}\n")
            continue

        # Record the success so a later run can skip this indicator. The
        # stored value matches the ``str(indicator_path)`` lookup above.
        with open(indexed_indicators_path, "a+") as f:
            f.write(f"{indicator_path}\n")
        indexed_indicators.add(str(indicator_path))
|
||
|
||
def main(collection_dir: Union[str, Path]):
    """Command-line entry point: index the indicators of one collection.

    Args:
        collection_dir: Collection directory; a leading ``~`` is expanded
            before use.

    Raises:
        AssertionError: If the expanded directory does not exist.
    """
    resolved_dir = Path(collection_dir).expanduser()
    if not resolved_dir.exists():
        raise AssertionError(f"File {resolved_dir} does not exist.")

    print(f"Loading indicators from {resolved_dir}...")
    load_indicators(resolved_dir)
|
||
|
||
# Script entry point: python-fire turns the parameters of `main` into
# command-line flags.
if __name__ == "__main__":
    # python -m llm4data.scripts.indexing.indicators.load_indicators --collection_dir=data/sources/indicators/wdi
    fire.Fire(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from typing import Union | ||
import json | ||
import requests | ||
from pathlib import Path | ||
import fire | ||
from tqdm.auto import tqdm | ||
|
||
from metaschema.indicators2 import IndicatorsSchema | ||
|
||
|
||
def main(
    nada_headers: Union[str, Path],
    metadata_dir: Union[str, Path],
    max_id: int = 2000,
    timeout: float = 30.0,
):
    """Scrape WDI indicator metadata from the NADA export endpoint.

    Export ids ``1..max_id`` are requested in order. Each response is
    validated with ``IndicatorsSchema`` and saved as
    ``<metadata_dir>/<series idno>.json``. Scraping stops at the first
    response that is not HTTP 200 or whose JSON payload is empty.

    Args:
        nada_headers: Path to the NADA headers file in JSON format. You can get this from the Network tab in your browser.
        metadata_dir: Path to the directory where the metadata files will be saved.
        max_id: Highest export id to request (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the scraper
            indefinitely.

    Raises:
        AssertionError: If ``nada_headers`` or ``metadata_dir`` does not exist.
        requests.exceptions.Timeout: If a request exceeds ``timeout``.
    """
    nada_headers = Path(nada_headers)
    assert nada_headers.exists(), f"{nada_headers} does not exist"

    metadata_dir = Path(metadata_dir)
    assert metadata_dir.exists(), f"{metadata_dir} does not exist"

    url_template = "https://dev.ihsn.org/wdi/index.php/metadata/export/{no}/json"
    headers = json.loads(nada_headers.read_text())

    for no in tqdm(range(1, max_id + 1), desc="Scraping", position=1):
        r = requests.get(url_template.format(no=no), headers=headers, timeout=timeout)
        if r.status_code != 200:
            break

        data = r.json()
        if not data:
            break

        data = IndicatorsSchema(**data)

        filename = metadata_dir / f"{data.series_description.idno}.json"
        # ``default=str`` stringifies values json cannot encode natively.
        filename.write_text(json.dumps(data.dict(exclude_none=True), default=str))
|
||
|
||
# Script entry point: python-fire turns the parameters of `main` into
# command-line flags.
if __name__ == "__main__":
    # python -m scripts.scrapers.indicators.nada_wdi --nada_headers=secrets/nada_headers.json --metadata_dir=data/sources/indicators/wdi/metadata
    fire.Fire(main)