Skip to content

Commit

Permalink
Add langkit metadata to schema for inclusion in whylogs profiles (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamie256 authored Nov 10, 2023
1 parent 77d9d7d commit 987765b
Show file tree
Hide file tree
Showing 19 changed files with 91 additions and 60 deletions.
6 changes: 4 additions & 2 deletions langkit/all_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from whylogs.experimental.core.udf_schema import udf_schema
from whylogs.core.schema import DeclarativeSchema

from . import LangKitConfig
from langkit.metadata import attach_schema_metadata

from langkit import LangKitConfig
from langkit import injections
from langkit import topics
from langkit import regexes
Expand All @@ -22,5 +24,5 @@ def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
themes.init(config=config)
toxicity.init(config=config)
input_output.init(config=config)
text_schema = udf_schema()
text_schema = attach_schema_metadata(udf_schema(), "all_metrics")
return text_schema
2 changes: 1 addition & 1 deletion langkit/count_regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from langkit.pattern_loader import PatternLoader
from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column
from whylogs.core.stubs import pd
from typing import Dict, List, Optional, Set, Union

Expand Down
2 changes: 1 addition & 1 deletion langkit/injections.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, List, Optional, Union
from whylogs.core.stubs import pd
from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column
from langkit import LangKitConfig, lang_config, prompt_column
from sentence_transformers import SentenceTransformer
import requests
from io import BytesIO
Expand Down
2 changes: 1 addition & 1 deletion langkit/input_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from sentence_transformers import util
from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column
from langkit.transformer import Encoder

_prompt = prompt_column
Expand Down
5 changes: 3 additions & 2 deletions langkit/light_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from whylogs.experimental.core.udf_schema import udf_schema
from whylogs.core.schema import DeclarativeSchema

from . import LangKitConfig
from langkit import LangKitConfig
from langkit.metadata import attach_schema_metadata
from langkit import regexes
from langkit import textstat

Expand All @@ -11,5 +12,5 @@ def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
regexes.init(config=config)
textstat.init(config=config)

text_schema = udf_schema()
text_schema = attach_schema_metadata(udf_schema(), "light_metrics")
return text_schema
5 changes: 3 additions & 2 deletions langkit/llm_metrics.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import LangKitConfig
from langkit.metadata import attach_schema_metadata
from langkit import LangKitConfig
from logging import getLogger
from typing import Optional
from whylogs.experimental.core.udf_schema import udf_schema
Expand Down Expand Up @@ -27,5 +28,5 @@ def init(config: Optional[LangKitConfig] = None) -> DeclarativeSchema:
toxicity.init(config=config)
input_output.init(config=config)

text_schema = udf_schema()
text_schema = attach_schema_metadata(udf_schema(), "llm_metrics")
return text_schema
41 changes: 41 additions & 0 deletions langkit/metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from langkit import __version__
from logging import getLogger

from typing import Any, Dict, Optional


# Metadata key under which the LangKit package version is stored.
_LANGKIT_VERSION_METADATA_KEY = "langkit.version"
# Metadata key under which the metric collection name is stored.
_LANGKIT_METRIC_COLLECTION_KEY = "langkit.metric_collection"
# Module-level logger used to warn when metadata cannot be updated.
diagnostic_logger = getLogger(__name__)


def _check_for_metadata(schema: Any) -> Optional[Dict[str, str]]:
if schema is not None and hasattr(schema, "metadata"):
metadata = getattr(schema, "metadata")
if metadata is not None and isinstance(metadata, dict):
return metadata
return None


def _add_langkit_version_metadata(
    metadata: Optional[Dict[str, str]], metric_collection_name: Optional[str]
) -> Optional[Dict[str, str]]:
    """Record the LangKit version (and metric collection) in ``metadata``.

    The mapping is updated in place and the same object is returned.

    Args:
        metadata: Mapping to update. ``None`` is tolerated: a warning is
            logged and nothing is written.
        metric_collection_name: Value stored under the metric-collection
            key; skipped when falsy (``None`` or an empty string).

    Returns:
        The ``metadata`` object that was passed in, or ``None`` when no
        metadata was supplied.
    """
    if metadata is None:
        diagnostic_logger.warning("metadata is None, LangKit won't update metadata")
        return None

    metadata[_LANGKIT_VERSION_METADATA_KEY] = __version__
    if metric_collection_name:
        metadata[_LANGKIT_METRIC_COLLECTION_KEY] = metric_collection_name
    return metadata


def attach_schema_metadata(schema: Any, metric_collection_name: Optional[str]) -> Any:
    """Add LangKit metadata to ``schema`` when it exposes a metadata dict.

    Args:
        schema: Object expected to carry a ``metadata`` dict attribute.
        metric_collection_name: Name of the metric collection to record
            alongside the LangKit version.

    Returns:
        The same ``schema`` object that was passed in. If no metadata dict
        is found, a warning is logged and the schema is left untouched.
    """
    schema_metadata = _check_for_metadata(schema)
    if schema_metadata is not None:
        _add_langkit_version_metadata(schema_metadata, metric_collection_name)
    else:
        diagnostic_logger.warning(
            "schema does not contain metadata, LangKit won't update metadata"
        )
    return schema
2 changes: 1 addition & 1 deletion langkit/nlp_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import List, Optional, Set
from whylogs.experimental.core.udf_schema import register_dataset_udf
import evaluate
from . import LangKitConfig, lang_config, response_column
from langkit import LangKitConfig, lang_config, response_column
from logging import getLogger


Expand Down
2 changes: 1 addition & 1 deletion langkit/pattern_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from logging import getLogger
from typing import Optional

from . import LangKitConfig, lang_config
from langkit import LangKitConfig, lang_config


diagnostic_logger = getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion langkit/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from langkit.pattern_loader import PatternLoader
from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column
from whylogs.core.metrics.metrics import FrequentItemsMetric
from whylogs.core.resolvers import MetricSpec
from typing import Dict, List, Optional
Expand Down
2 changes: 1 addition & 1 deletion langkit/sentiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional

from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column


_prompt = prompt_column
Expand Down
17 changes: 17 additions & 0 deletions langkit/tests/test_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
def test_metadata_langkit_version():
    """Profiles logged with the light_metrics schema carry LangKit metadata."""
    import whylogs as why
    from langkit import __version__
    from langkit.metadata import (
        _LANGKIT_VERSION_METADATA_KEY,
        _LANGKIT_METRIC_COLLECTION_KEY,
    )
    from langkit import light_metrics  # noqa

    expected_metric_collection_name = "light_metrics"
    text_schema = light_metrics.init()
    results = why.log({"prompt": "hello", "response": "goodbye"}, schema=text_schema)

    # Check that metadata exists before indexing into it, so missing or empty
    # metadata fails on a clear assertion instead of a lookup error.
    assert results.metadata
    assert results.metadata[_LANGKIT_VERSION_METADATA_KEY] == __version__
    assert (
        results.metadata[_LANGKIT_METRIC_COLLECTION_KEY]
        == expected_metric_collection_name
    )
7 changes: 5 additions & 2 deletions langkit/textstat.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from logging import getLogger
from typing import Callable, Dict, List, Optional, Tuple, Union
import textstat
from whylogs.core.stubs import pd
from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, prompt_column, response_column
from langkit import LangKitConfig, prompt_column, response_column


diagnostic_logger = getLogger(__name__)
Expand Down Expand Up @@ -43,6 +42,8 @@
def wrapper(
stat_name: str, column: str
) -> Callable[[Union[pd.DataFrame, Dict[str, List]]], Union[pd.Series, List]]:
import textstat

stat = textstat.textstat.__getattribute__(stat_name)

def wrappee(text: Union[pd.DataFrame, Dict[str, List]]) -> Union[pd.Series, List]:
Expand All @@ -54,6 +55,8 @@ def wrappee(text: Union[pd.DataFrame, Dict[str, List]]) -> Union[pd.Series, List
def aggregate_wrapper(
column: str,
) -> Callable[[Union[pd.DataFrame, Dict[str, List]]], Union[pd.Series, List]]:
import textstat

stat = textstat.textstat.text_standard

def wrappee(text: Union[pd.DataFrame, Dict[str, List]]) -> Union[pd.Series, List]:
Expand Down
2 changes: 1 addition & 1 deletion langkit/themes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from langkit.transformer import Encoder

from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column

diagnostic_logger = getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion langkit/topics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from transformers import (
pipeline,
)
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column


_topics: List[str] = lang_config.topics
Expand Down
2 changes: 1 addition & 1 deletion langkit/toxicity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Optional

from whylogs.experimental.core.udf_schema import register_dataset_udf
from . import LangKitConfig, lang_config, prompt_column, response_column
from langkit import LangKitConfig, lang_config, prompt_column, response_column


_prompt = prompt_column
Expand Down
2 changes: 1 addition & 1 deletion langkit/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from . import lang_config
from langkit import lang_config


def _get_data_home() -> str:
Expand Down
46 changes: 6 additions & 40 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ readme = "DESCRIPTION.md"
python = ">=3.8,<4"
textstat = "^0.7.3"
pandas = "*"
whylogs = "^1.3.9"
whylogs = "^1.3.13"


# optional dependencies
Expand Down

0 comments on commit 987765b

Please sign in to comment.