Skip to content

Commit

Permalink
docs(website): add support matrix (#102)
Browse files Browse the repository at this point in the history
  • Loading branch information
jitingxu1 authored May 29, 2024
1 parent c6a05e2 commit 92a4a23
Show file tree
Hide file tree
Showing 10 changed files with 492 additions and 19 deletions.
2 changes: 2 additions & 0 deletions docs/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ website:
- text: "Tutorial"
href: tutorial/index.ipynb
- sidebar:reference
- text: "Support Matrix"
href: support_matrix.qmd
tools:
- icon: github
menu:
Expand Down
162 changes: 162 additions & 0 deletions docs/step_config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
DiscretizeKBins:
  configurations:
    - name: uniform
      config:
        strategy: uniform
        n_bins: 5
    - name: quantile
      config:
        strategy: quantile
        n_bins: 5

HandleUnivariateOutliers:
  configurations:
    - name: z-score
      config:
        method: z-score
    - name: IQR
      config:
        method: IQR

DropZeroVariance:
  configurations:
    - name: int
      config:
        inputs: integer
    - name: float
      config:
        inputs: floating
    - name: str
      config:
        inputs: string

FillNA:
  configurations:
    - name: float
      config:
        inputs: floating
        fill_value: 0
    - name: int
      config:
        inputs: integer
        fill_value: 0
    - name: str
      config:
        inputs: string
        fill_value: "NULL"

ImputeMode:
  configurations:
    - name: int
      config:
        inputs: integer
    - name: float
      config:
        inputs: floating
    - name: str
      config:
        inputs: string

ExpandDate:
  configurations:
    - name: d
      config:
        inputs: date
        components:
          - day
    - name: w
      config:
        inputs: date
        components:
          - week
    - name: m
      config:
        inputs: date
        components:
          - month
    - name: y
      config:
        inputs: date
        components:
          - year
    - name: dow
      config:
        inputs: date
        components:
          - dow
    - name: doy
      config:
        inputs: date
        components:
          - doy

ExpandDateTime:
  configurations:
    - name: ms
      config:
        inputs: timestamp
        components:
          - millisecond
    - name: s
      config:
        inputs: timestamp
        components:
          - second
    - name: m
      config:
        inputs: timestamp
        components:
          - minute
    - name: d
      config:
        inputs: timestamp
        components:
          - day
    - name: week
      config:
        inputs: timestamp
        components:
          - week
    - name: mon
      config:
        inputs: timestamp
        components:
          - month
    - name: y
      config:
        inputs: timestamp
        components:
          - year
    - name: dow
      config:
        inputs: timestamp
        components:
          - dow
    - name: doy
      config:
        inputs: timestamp
        components:
          - doy

ExpandTime:
  configurations:
    - name: h
      config:
        inputs: time
        components:
          - hour
    - name: m
      config:
        inputs: time
        components:
          - minute
    - name: s
      config:
        inputs: time
        components:
          - second
    - name: ms
      config:
        inputs: time
        components:
          - millisecond
200 changes: 200 additions & 0 deletions docs/support_matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
from __future__ import annotations

from datetime import date, datetime, time
from pathlib import Path

import ibis
import numpy as np
import pandas as pd
import yaml

import ibis_ml as ml


def get_leaf_classes(op):
    """Yield each leaf subclass of ``op`` exactly once.

    A leaf is a class reachable from ``op`` through ``__subclasses__()``
    that has no subclasses of its own. ``op`` itself is never yielded,
    even when it has no subclasses.

    Leaves are produced in depth-first order, following the order in
    which ``__subclasses__()`` lists each class's children. A class that
    is reachable through more than one parent (diamond inheritance) is
    reported only at its first occurrence.

    Parameters
    ----------
    op
        The root class whose hierarchy is walked.

    Yields
    ------
    type
        Every distinct leaf class below ``op``.
    """
    seen = set()
    # Explicit stack instead of recursion; children are pushed reversed so
    # that they are popped in their original ``__subclasses__()`` order.
    pending = list(reversed(op.__subclasses__()))
    while pending:
        candidate = pending.pop()
        if candidate in seen:
            # Already visited via another parent: its leaves were emitted.
            continue
        seen.add(candidate)

        descendants = candidate.__subclasses__()
        if descendants:
            pending.extend(reversed(descendants))
        else:
            yield candidate


def check_backend(backend, exprs):
    """Report whether ``backend`` can handle every expression in ``exprs``.

    For the ``pandas``, ``polars`` and ``dask`` backends a connection is
    opened with ``ibis.connect`` and each expression is executed on it.
    For every other backend each expression is only compiled with
    ``ibis.to_sql``.

    Parameters
    ----------
    backend
        Backend name, used both as the connection scheme and as the
        ``ibis.to_sql`` dialect.
    exprs
        Iterable of expressions to check.

    Returns
    -------
    bool
        ``True`` when all expressions were processed without one of the
        handled errors, ``False`` otherwise. Other exceptions propagate.
    """
    runs_in_memory = backend in ("pandas", "polars", "dask")

    if not runs_in_memory:
        # SQL-style backends: compiling to SQL is the support check.
        try:
            for expr in exprs:
                ibis.to_sql(expr, backend)
        except (ibis.common.exceptions.TranslationError, AttributeError):
            return False
        return True

    # In-memory backends: actually execute against a fresh connection.
    try:
        con = ibis.connect(f"{backend}://")
        for expr in exprs:
            con.execute(expr)
    except (TypeError, ValueError, AttributeError):
        return False
    return True


def make_support_matrix():
    """Build the step-by-backend support matrix.

    Every leaf subclass of ``ml.Step`` is expanded into one or more
    parameterisations read from ``./step_config.yml`` (resolved against
    the current working directory). Steps without an entry in that file
    get a single parameterisation using ``ml.numeric()`` as inputs.

    For each backend reported by ``ibis.util.backend_entry_points()``,
    each parameterisation is fitted on a small in-memory sample table and
    the resulting expressions are passed to ``check_backend``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(Category, Step)``. Columns are a three-level
        ``MultiIndex`` named ``("Backend", "Full coverage",
        "Partial coverage")``. Each cell is ``True`` (every
        parameterisation supported), ``False`` (none supported), a
        comma-separated string of the supported parameterisation names,
        or the string ``"backend-specific"``.
    """
    all_steps = list(get_leaf_classes(ml.Step))
    with Path("./step_config.yml").open() as file:
        # An empty YAML document loads as None; treat it as "no overrides".
        step_config = yaml.safe_load(file) or {}

    expanded_steps = []
    for step in all_steps:
        step_name = step.__name__
        # Last component of the defining module with its first character
        # removed (presumably a leading underscore -- confirm against the
        # ibis_ml package layout).
        module_category = str(step.__module__).split(".")[-1][1:]
        configurations = step_config.get(step_name, {}).get("configurations", [])

        if configurations:
            expanded_steps.append(
                [
                    {
                        "name": step_name,
                        "step_params": config["name"],
                        "category": module_category,
                        "params": {
                            **config["config"],
                            # The YAML names a selector factory on ``ml``;
                            # replace the name with an instantiated selector.
                            "inputs": getattr(
                                ml, config["config"].get("inputs", "numeric")
                            )(),
                        },
                    }
                    for config in configurations
                ]
            )
        else:
            expanded_steps.append(
                [
                    {
                        "name": step_name,
                        "step_params": "None",
                        "category": module_category,
                        "params": {"inputs": ml.numeric()},
                    }
                ]
            )

    backends = sorted(ep.name for ep in ibis.util.backend_entry_points())
    alltypes = {
        "string": np.array(["a", None, "b"], dtype="str"),
        "int": np.array([1, 2, 3], dtype="int64"),
        "floating": np.array([1.0, 2.0, 3.0], dtype="float64"),
        "date": [date(2017, 4, 2), date(2017, 4, 2), date(2017, 4, 2)],
        "time": [time(9, 1, 1), time(10, 1, 11), None],
        "datetime": [
            datetime(2017, 4, 2, 10, 1, 0),
            datetime(2018, 4, 2, 10, 1, 0),
            None,
        ],
        "target": np.array([1, 0, 1], dtype="int8"),
    }

    steps = {"steps": expanded_steps}
    unsupported_cols = {"druid": ["time"]}

    backend_specific = {
        "support": ["backend-specific"],
        "not_support": ["backend-specific"],
    }
    # Steps that are never probed: their result is fixed up front.
    special_step = {
        "Drop": {"support": [], "not_support": []},
        "Cast": backend_specific,
        "MutateAt": backend_specific,
        "Mutate": backend_specific,
    }

    sample = pd.DataFrame(alltypes)

    for backend in backends:
        skipped_cols = unsupported_cols.get(backend, [])
        # The sample table only depends on the backend, so build it once
        # per backend rather than once per step parameterisation.
        df = sample.drop(columns=skipped_cols)
        data = ibis.memtable(df)

        results = []
        for expand_step in expanded_steps:
            res = {"support": [], "not_support": []}
            for step_dict in expand_step:
                step_name = step_dict["name"]
                input_type = type(step_dict["params"]["inputs"]).__name__
                if step_name in special_step:
                    # Shared dict: read-only from here on, never appended to.
                    res = special_step[step_name]
                    continue

                if input_type in skipped_cols:
                    res["not_support"].append(step_dict["step_params"])
                    continue

                # construct a step
                step = getattr(ml, step_dict["name"])(**step_dict["params"])
                metadata = ml.core.Metadata(targets=("target",))
                step.fit_table(data, metadata)

                all_expr = []
                if hasattr(step, "_fit_expr"):
                    all_expr.extend(step._fit_expr)  # noqa: SLF001
                output = step.transform_table(data)
                all_expr.append(output)

                if check_backend(backend, all_expr):
                    res["support"].append(step_dict["step_params"])
                else:
                    res["not_support"].append(step_dict["step_params"])

            if not res["not_support"]:
                results.append(True)
            elif not res["support"]:
                results.append(False)
            else:
                # Deduplicate while keeping configuration order so the
                # rendered cell is identical from one run to the next
                # (joining a ``set`` of strings is hash-order dependent).
                results.append(",".join(dict.fromkeys(res["support"])))

        steps[backend] = list(results)

    support_matrix = (
        pd.DataFrame(steps)
        .assign(
            Category=lambda df: df["steps"].apply(lambda x: x[0]["category"]),
            Step=lambda df: df["steps"].apply(lambda x: x[0]["name"]),
        )
        .drop(["steps"], axis=1)
        .set_index(["Category", "Step"])
        .sort_index()
    )

    all_visible_ops_count = len(support_matrix)

    def count_full(column):
        return sum(
            1 for value in column if value is True or value == "backend-specific"
        )

    def count_partial(column):
        return sum(
            1
            for value in column
            if isinstance(value, str) and value != "backend-specific"
        )

    def coverage_labels(counter):
        # One "<count> (<percent>%)" label per backend column.
        return pd.Index(
            support_matrix.apply(counter).map(
                lambda n: f"{n} ({round(100 * n / all_visible_ops_count)}%)"
            )
        )

    fully_coverage = coverage_labels(count_full)
    partial_coverage = coverage_labels(count_partial)

    support_matrix.columns = pd.MultiIndex.from_tuples(
        list(zip(support_matrix.columns, fully_coverage, partial_coverage)),
        names=("Backend", "Full coverage", "Partial coverage"),
    )

    return support_matrix


if __name__ == "__main__":
    # Script entry point: build the matrix and dump it to stdout.
    matrix = make_support_matrix()
    print(matrix)  # noqa: T201
Loading

0 comments on commit 92a4a23

Please sign in to comment.