Skip to content

Commit

Permalink
Improve data indexing and prompt template for WDI (#16)
Browse files Browse the repository at this point in the history
* Implement db storage of wdi data and fix indicators loading

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add script to create a qdrant index over fields to prevent timeout errors

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Update the prompt template for the WDI SQL

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

---------

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>
  • Loading branch information
avsolatorio authored Jun 19, 2023
1 parent 38b8dca commit 0ab0e52
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 1 deletion.
8 changes: 8 additions & 0 deletions llm4data/llm/indicators/wdi_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def load_wdi_jsons(cls, wdi_jsons_dir: Path):
Do bulk insert.
"""

# Drop and recreate the table
cls.__table__.drop(engine, checkfirst=True)
Base.metadata.create_all(engine)

wdi_jsons_dir = Path(wdi_jsons_dir)

if not wdi_jsons_dir.exists() or not wdi_jsons_dir.is_dir():
Expand All @@ -48,6 +52,10 @@ def load_wdi_jsons(cls, wdi_jsons_dir: Path):
session.bulk_save_objects(wdi_objects)
session.commit()

# Manual step (not run by this code): create an index on the indicator column.
#   psql -U postgres -d wdi
#   CREATE INDEX indicator_index ON wdi USING hash (indicator);

@classmethod
def run_sql(
cls,
Expand Down
3 changes: 2 additions & 1 deletion llm4data/prompts/indicators/wdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def __init__(self, input_variables=None, template=None):
" Write an SQL query for the prompt: ```{{{{user_content}}}}```\n\n"
"table: {table}\n"
"fields: {fields}\n\n"
" Use the convention `:param` and not `?`."
"Only the indicator can parameterized and you must fill the rest."
" Use the convention `:indicator` and not `?`."
" Use country_iso3 when querying, use country in the result.\n\n"
"Use the last 10 years if no year is specified."
" Drop rows with no value.\n\n"
Expand Down
46 changes: 46 additions & 0 deletions llm4data/scripts/indexing/create_field_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from llm4data.index import get_docs_index
from llm4data import configs
import fire


def create_doc_index(field_name: str, field_schema: str):
    """Create a payload field index on the docs collection.

    Args:
        field_name (str): The name of the field to index. Must start with
            ``"document_description."``, the expected metadata key for the
            docs schema.
        field_schema (str): The schema of the field to index (for example
            ``"keyword"``); passed through unchanged to the client.

    Returns:
        The response returned by the index client's
        ``create_payload_index`` call.

    Raises:
        ValueError: If ``field_name`` does not start with
            ``"document_description."``.

    Examples:
        >>> from llm4data.scripts.indexing.create_field_index import create_doc_index
        >>> create_doc_index("document_description.title_statement.idno", "keyword")  # doctest: +SKIP
    """
    # Validate with an explicit exception rather than ``assert``: assertions
    # are removed when Python runs with -O, which would let a malformed field
    # name reach the server.
    if not field_name.startswith("document_description."):
        raise ValueError(
            f"Field name must start with 'document_description.' but got {field_name}"
        )

    doc_index = get_docs_index()

    # The payload stores document metadata under
    # "metadata.<METADATA_KEY>.", so the indexed path is prefixed accordingly.
    return doc_index.client.create_payload_index(
        doc_index.collection_name,
        field_name=f"metadata.{configs.METADATA_KEY}.{field_name}",
        field_schema=field_schema,
    )


def main(data_type: str, field_name: str, field_schema: str):
    """Command-line entry point: create a field index for ``data_type``.

    Only the ``"docs"`` data type is handled; any other value raises
    ``ValueError``.
    """
    # Reject unsupported data types up front.
    if data_type != "docs":
        raise ValueError(f"Unknown data type {data_type}")

    create_doc_index(field_name, field_schema)


if __name__ == "__main__":
    # Expose `main` as a command-line interface via python-fire. Example:
    # python -m llm4data.scripts.indexing.create_field_index --data_type=docs --field_name="document_description.title_statement.idno" --field_schema=keyword
    fire.Fire(main)
3 changes: 3 additions & 0 deletions llm4data/scripts/indexing/indicators/load_indicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def load_indicators(collection_dir: Path):
metadata=s.dict(exclude_none=True),
)

with open(indexed_indicators_path, "a+") as f:
f.write(f"{indicator_path}\n")

except KeyboardInterrupt:
raise KeyboardInterrupt

Expand Down

0 comments on commit 0ab0e52

Please sign in to comment.