Skip to content

Commit

Permalink
Added (untested) code metadata type shrinking
Browse files (browse the repository at this point in the history)
  • Loading branch information
mmcdermott committed Jun 15, 2024
1 parent d0a4a0c commit 2bb2863
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion scripts/preprocessing/collect_code_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import hydra
import polars as pl
import polars.selectors as cs
from loguru import logger
from omegaconf import DictConfig, OmegaConf

Expand Down Expand Up @@ -79,7 +80,10 @@ def main(cfg: DictConfig):
logger.info("All map shards complete! Starting code metadata reduction computation.")
reducer_fn = reducer_fntr(cfg.stage_cfg, cfg.get("code_modifier_columns", None))

reduced = reducer_fn(*[pl.scan_parquet(fp, glob=False) for fp in all_out_fps])
reduced = (
reducer_fn(*[pl.scan_parquet(fp, glob=False) for fp in all_out_fps])
.with_columns(cs.is_numeric().shrink_dtype().keep_name())
)
write_lazyframe(reduced, output_dir / "code_metadata.parquet")
logger.info(f"Finished reduction in {datetime.now() - start}")

Expand Down

0 comments on commit 2bb2863

Please sign in to comment.