Integrate indicators indexing and guide (#15)
* Use the schema for context generation

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Update guide and add wbdocs metadata migrate script

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add nada_wdi scraper and create secrets dir

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add generation of text for the wdi

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Implement indicator indexer scripts

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

---------

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>
avsolatorio authored Jun 18, 2023
1 parent 2641e6e commit 38b8dca
Showing 9 changed files with 319 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
secrets/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
65 changes: 65 additions & 0 deletions GUIDE.md
@@ -14,6 +14,11 @@ For example:
data/sources/docs/prwp/
- pdf/
- metadata/
data/sources/indicators/wdi/
- text/
- metadata/
```

## Content
@@ -37,3 +42,63 @@ python -m llm4data.scripts.indexing.docs.load_docs --path=data/sources/docs/prwp
```

This will process the documents and store the generated vectors in the configured vector index.


## The World Bank Documents and Reports

The World Bank provides programmatic access to open-access documents and reports via an API. To index the documents and reports, we first download the metadata available from the API. We can then use the `pdfurl` value in each metadata record to scrape the PDFs.

The metadata from the Documents and Reports API does not follow the metadata standard adopted in this library. However, we have implemented a script that migrates it into the supported standard. The steps for this migration are shown below.

1. Download the metadata and store it in a directory, e.g., `raw_metadata/`. We follow the convention of naming each metadata file `<id>.metadata.json`.
2. Create the source directory for this data, for example, `data/sources/docs/prwp`.
3. Use the `pdfurl` value to scrape the PDFs (see the sketch at the end of this section).
4. Store the PDFs in the `pdf` directory under `data/sources/docs/prwp`.
5. Run the following command to migrate the metadata to the standard adopted in this library:

```bash
python -m llm4data.schema.docs.migrate_wbdocs_metadata --source_dir=raw_metadata/ --target_dir=data/sources/docs/prwp/metadata
```

6. Run the following command to index the documents:

```bash
python -m llm4data.scripts.indexing.docs.load_docs --path=data/sources/docs/prwp/pdf --strict
```
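
For reference, below is a minimal sketch of steps 1, 3, and 4 above. The endpoint and query parameters are assumptions based on the public Documents and Reports v2 search API, and the response layout may differ; adjust the query to the document set you need.

```python
# A minimal sketch (not part of this repo) of downloading metadata from the
# World Bank Documents and Reports search API and scraping the PDFs.
# The endpoint and response layout below are assumptions about the public v2 API.
import json
from pathlib import Path

import requests

raw_metadata_dir = Path("raw_metadata")
pdf_dir = Path("data/sources/docs/prwp/pdf")
raw_metadata_dir.mkdir(parents=True, exist_ok=True)
pdf_dir.mkdir(parents=True, exist_ok=True)

response = requests.get(
    "https://search.worldbank.org/api/v2/wds",
    params={"format": "json", "docty": "Policy Research Working Paper", "rows": 50},
)
response.raise_for_status()

for doc_id, meta in response.json().get("documents", {}).items():
    if not isinstance(meta, dict) or not meta.get("pdfurl"):
        continue  # skip entries without a PDF link (e.g., facet summaries)

    # Step 1: store the raw metadata as <id>.metadata.json.
    (raw_metadata_dir / f"{doc_id}.metadata.json").write_text(json.dumps(meta))

    # Steps 3 and 4: download the PDF into the pdf/ directory.
    pdf = requests.get(meta["pdfurl"])
    if pdf.ok:
        (pdf_dir / f"{doc_id}.pdf").write_bytes(pdf.content)
```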

## The World Bank World Development Indicators

The World Bank provides programmatic access to the World Development Indicators via an API. To index data from the World Development Indicators, we must first download the metadata.

We download the metadata from the NADA Dev catalog because its WDI metadata already follows the metadata standard adopted in this library. We have implemented a script to download this metadata easily. The steps below show how to download the metadata and index the data.

```bash
python -m scripts.scrapers.indicators.nada_wdi --nada_headers=secrets/nada_headers.json --metadata_dir=data/sources/indicators/wdi/metadata
```

You will need to create the `secrets/nada_headers.json` file. To get the headers, log in to the NADA Dev website and copy the request headers from your browser.

```{warning}
The `secrets` directory is not tracked by git.
```
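
For reference, a minimal sketch of creating the headers file is shown below. The header names are illustrative assumptions; copy whichever headers your logged-in session actually sends.

```python
# Hypothetical sketch: persist the request headers copied from your browser's
# Network tab after logging in to NADA Dev. Header names here are illustrative.
import json
from pathlib import Path

headers = {
    "User-Agent": "Mozilla/5.0 ...",  # as sent by your browser
    "Cookie": "<session cookies from the logged-in NADA Dev session>",
}

secrets_dir = Path("secrets")
secrets_dir.mkdir(exist_ok=True)
(secrets_dir / "nada_headers.json").write_text(json.dumps(headers, indent=2))
```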

Once you have downloaded the metadata, create the `text` directory under `data/sources/indicators/wdi` (e.g., `mkdir -p data/sources/indicators/wdi/text`). This directory will contain a text file for each indicator: the script below extracts the relevant information from the metadata and stores it in these files.

```bash
python -m llm4data.schema.indicators.create_wdi_text --metadata_dir=data/sources/indicators/wdi/metadata
```

Note that the text is taken from the following fields in the metadata:

- name
- definition_long or definition_short
- relevance
- statistical_concept
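
For illustration, the generated text file for a hypothetical indicator would look like this (field values abridged):

```
Indicator name: GDP per capita (constant 2015 US$)

Definition: GDP per capita is gross domestic product divided by midyear population. ...

Development relevance: ...

Statistical concept and methodology: ...
```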


After creating the text files, you can index the data using the following command:

```bash
python -m llm4data.scripts.indexing.indicators.load_indicators --collection_dir=data/sources/indicators/wdi
```
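
To sanity-check the result, you can run a similarity search against the configured vector index. This sketch mirrors how `llm4data/prompts/context.py` queries the index; it assumes your vector store is configured and reachable.

```python
# A quick check (sketch) that the indicators were indexed.
from llm4data.index import get_indicators_index

indicators = get_indicators_index()
results = indicators.similarity_search("GDP per capita growth", k=3)
for doc in results:
    print(doc.page_content[:100])
```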
6 changes: 3 additions & 3 deletions llm4data/prompts/context.py
@@ -2,7 +2,7 @@
from llm4data.index import get_docs_index, get_indicators_index
from llm4data import configs
from hashlib import md5
-from llm4data.schema.schema2info import get_doc_title, get_doc_authors
+from llm4data.schema.schema2info import get_doc_id, get_doc_title, get_doc_authors
from langchain.docstore.document import Document

indicators = get_indicators_index()
@@ -26,7 +26,7 @@ def get_page(doc: Document, offset=0, default=-1):
def get_contexts(prompt: str, k_docs: int = 5, k_indicators: int = 10, doc_id: str = None):
    # Search for documents
    if doc_id is not None:
-        docs_result = docs.similarity_search(prompt, k=k_docs, filter={configs.METADATA_KEY: {"id": doc_id}})
+        docs_result = docs.similarity_search(prompt, k=k_docs, filter={configs.METADATA_KEY: {"document_description": {"title_statement": {"idno": doc_id}}}})
    else:
        docs_result = docs.similarity_search(prompt, k=k_docs)
    indicators_result = indicators.similarity_search(prompt, k=k_indicators)
@@ -38,7 +38,7 @@ def get_contexts(prompt: str, k_docs: int = 5, k_indicators: int = 10, doc_id: s
    indicators_context_records = []

    for doc in docs_result:
-        doc_id = doc.metadata[configs.METADATA_KEY]["id"]
+        doc_id = get_doc_id(doc.metadata[configs.METADATA_KEY])
        doc_context.append("<h1>Title: " + get_doc_title(doc.metadata[configs.METADATA_KEY]) + "</h1>")

        if doc.metadata[configs.METADATA_KEY].get("authors"):
31 changes: 31 additions & 0 deletions llm4data/schema/docs/migrate_wbdocs_metadata.py
@@ -0,0 +1,31 @@
import json
from typing import Union, Optional
from pathlib import Path
import fire
from tqdm.auto import tqdm
from llm4data.schema.docs import wbdocs


def main(source_dir: Union[str, Path], target_dir: Optional[Union[str, Path]] = None):
    """Migrate raw Documents and Reports metadata to the schema adopted by this library.

    Args:
        source_dir: Directory containing the raw `<id>.metadata.json` files.
        target_dir: Directory to write the migrated metadata to. If omitted,
            the source files are overwritten in place.
    """
    source_dir = Path(source_dir)
    assert source_dir.exists(), f"{source_dir} does not exist"

    if target_dir:
        target_dir = Path(target_dir)
        assert target_dir.exists(), f"{target_dir} does not exist"

    for p in tqdm(sorted(source_dir.glob("D*.metadata.json"))):
        metadata = json.loads(p.read_text())
        sc = wbdocs.WBDocsToSchema(metadata).schema()

        # Write in place when no target directory is given.
        target_path = (target_dir / p.name) if target_dir else p
        target_path.write_text(json.dumps(sc.dict(exclude_none=True)))


if __name__ == "__main__":
    # python -m llm4data.schema.docs.migrate_wbdocs_metadata --source_dir=<source_dir> --target_dir=<target_dir>
    fire.Fire(main)
60 changes: 60 additions & 0 deletions llm4data/schema/indicators/create_wdi_text.py
@@ -0,0 +1,60 @@
import json
from pathlib import Path
from typing import Union, Optional
from tqdm.auto import tqdm

import fire
from metaschema.indicators2 import IndicatorsSchema, SeriesDescription


def create_wdi_text(series_description: SeriesDescription):
    """Create the text for the WDI indicator.

    Args:
        series_description: The series description object.
    """
    sd = series_description
    texts = []

    # Text sources
    name = sd.name
    definition = sd.definition_long or sd.definition_short or ""
    dev_relevance = sd.relevance or ""
    stat_concept = sd.statistical_concept or ""

    if name:
        texts.append(f"Indicator name: {name}")
    if definition:
        texts.append(f"Definition: {definition}")
    if dev_relevance:
        texts.append(f"Development relevance: {dev_relevance}")
    if stat_concept:
        texts.append(f"Statistical concept and methodology: {stat_concept}")

    # Join the non-empty sections, separated by blank lines.
    text = "\n\n".join(texts)
    text = text.strip()

    return text


def main(metadata_dir: Union[str, Path]):
    metadata_dir = Path(metadata_dir)
    assert metadata_dir.exists(), f"{metadata_dir} does not exist"

    # Ensure the sibling text/ directory exists before writing.
    text_dir = metadata_dir.parent / "text"
    text_dir.mkdir(parents=True, exist_ok=True)

    for p in tqdm(sorted(metadata_dir.glob("*.json"))):
        fname = text_dir / f"{p.stem}.txt"
        if fname.exists():
            continue

        metadata = json.loads(p.read_text())
        s = IndicatorsSchema(**metadata)
        text = create_wdi_text(s.series_description)
        fname.write_text(text)


if __name__ == "__main__":
    # python -m llm4data.schema.indicators.create_wdi_text --metadata_dir=data/sources/indicators/wdi/metadata
    fire.Fire(main)
6 changes: 6 additions & 0 deletions llm4data/schema/schema2info.py
@@ -1,6 +1,12 @@
"""This module contains functions to extract information from the schema."""



def get_doc_id(metadata: dict) -> str:
"""Get the id of the document from the metadata document_description."""
return metadata["document_description"]["title_statement"]["idno"]


def get_doc_title(metadata: dict) -> str:
"""Get the title of the document from the metadata document_description."""
return metadata["document_description"]["title_statement"]["title"]
28 changes: 28 additions & 0 deletions llm4data/scripts/indexing/indicators/indicators.py
@@ -0,0 +1,28 @@
from typing import List, Optional, Union
from langchain.text_splitter import NLTKTextSplitter
from langchain.docstore.document import Document
from llm4data import configs
from llm4data.index import get_indicators_index


# Get access to the Qdrant indicators collection
indicators_index = get_indicators_index()
text_splitter = NLTKTextSplitter()


def build_document(text: str, metadata: Optional[dict] = None):
    # Wrap the text (and optional metadata payload) in a langchain Document.
    document = Document(page_content=text, metadata={configs.METADATA_KEY: metadata} if metadata else {})

    return document


def add_indicators(text: Union[str, List[str]], metadata: Optional[Union[dict, List[dict]]] = None):
    # Accept either a single text or a parallel list of texts and metadata.
    if isinstance(text, str):
        documents = [build_document(text, metadata)]
    else:
        if metadata is None:
            metadata = [None] * len(text)
        documents = [build_document(t, m) for t, m in zip(text, metadata)]

    # Add the documents to the collection
    indicators_index.add_documents(documents)
80 changes: 80 additions & 0 deletions llm4data/scripts/indexing/indicators/load_indicators.py
@@ -0,0 +1,80 @@
"""Given a dump of the WDI indicators contained in WDIEXCEL.xlsx file,
process them to generate the text for the embedding, extract the metadata
for the payload, and load them into the vector index.
"""
from typing import Union
from pathlib import Path
from tqdm.auto import tqdm
import fire
from .indicators import add_indicators, indicators_index
import json
from metaschema.indicators2 import IndicatorsSchema


def load_indicators(collection_dir: Path):
    """Load the indicators from the collection directory.

    Args:
        collection_dir (Path): Path to the collection directory.
    """
    cname = indicators_index.collection_name

    collection_dir = Path(collection_dir)
    assert collection_dir.exists(), f"{collection_dir} does not exist."

    text_dir = collection_dir / "text"
    metadata_dir = collection_dir / "metadata"
    assert text_dir.exists(), f"{text_dir} does not exist."
    assert metadata_dir.exists(), f"{metadata_dir} does not exist."

    indexed_indicators_path = collection_dir / f"indexed_indicators-{cname}.txt"
    failed_indicators_path = collection_dir / f"failed_indicators-{cname}.txt"

    print("Indexed indicators path:", indexed_indicators_path)

    # Resume support: skip indicators that were indexed in a previous run.
    indexed_indicators = set()
    if indexed_indicators_path.exists():
        print("Loading indexed indicators...")
        with open(indexed_indicators_path, "r") as f:
            for line in f:
                indexed_indicators.add(line.strip())

    print("Indexed indicators:", len(indexed_indicators))

    for indicator_path in tqdm(sorted(text_dir.glob("*.txt"))):
        if str(indicator_path) in indexed_indicators:
            continue
        metadata_path = metadata_dir / f"{indicator_path.stem}.json"

        try:
            metadata = json.loads(metadata_path.read_text())
            text = indicator_path.read_text()

            s = IndicatorsSchema(**metadata)

            add_indicators(
                text=text,
                metadata=s.dict(exclude_none=True),
            )

            # Record the success so subsequent runs can skip this indicator.
            with open(indexed_indicators_path, "a") as f:
                f.write(f"{indicator_path}\n")

        except KeyboardInterrupt:
            raise

        except Exception as e:
            # Log the failure and continue with the next indicator.
            with open(failed_indicators_path, "a+") as f:
                f.write(f"{indicator_path}\t{e}\n")
            continue


def main(collection_dir: Union[str, Path]):
    collection_dir = Path(collection_dir).expanduser()
    assert collection_dir.exists(), f"Directory {collection_dir} does not exist."

    print(f"Loading indicators from {collection_dir}...")
    load_indicators(collection_dir)


if __name__ == "__main__":
    # python -m llm4data.scripts.indexing.indicators.load_indicators --collection_dir=data/sources/indicators/wdi
    fire.Fire(main)
44 changes: 44 additions & 0 deletions scripts/scrapers/indicators/nada_wdi.py
@@ -0,0 +1,44 @@
from typing import Union
import json
import requests
from pathlib import Path
import fire
from tqdm.auto import tqdm

from metaschema.indicators2 import IndicatorsSchema


def main(nada_headers: Union[str, Path], metadata_dir: Union[str, Path], max_id: int = 2000):
    """
    Args:
        nada_headers: Path to the NADA headers file in JSON format. You can get this from the Network tab in your browser.
        metadata_dir: Path to the directory where the metadata files will be saved.
        max_id: Highest catalog entry id to try; scraping stops at the first missing entry.
    """
    nada_headers = Path(nada_headers)
    assert nada_headers.exists(), f"{nada_headers} does not exist"

    metadata_dir = Path(metadata_dir)
    assert metadata_dir.exists(), f"{metadata_dir} does not exist"

    headers = json.loads(nada_headers.read_text())

    def url(no: int) -> str:
        return f"https://dev.ihsn.org/wdi/index.php/metadata/export/{no}/json"

    for no in tqdm(range(1, max_id + 1), desc="Scraping", position=1):
        r = requests.get(url(no), headers=headers)
        if r.status_code == 200:
            data = r.json()
            if not data:
                break

            data = IndicatorsSchema(**data)

            # Save the metadata under the indicator's idno.
            filename = metadata_dir / f"{data.series_description.idno}.json"
            filename.write_text(json.dumps(data.dict(exclude_none=True), default=str))
        else:
            break


if __name__ == "__main__":
    # python -m scripts.scrapers.indicators.nada_wdi --nada_headers=secrets/nada_headers.json --metadata_dir=data/sources/indicators/wdi/metadata
    fire.Fire(main)
