diff --git a/scripts/preprocessing/collect_code_metadata.py b/scripts/preprocessing/collect_code_metadata.py index 514e871..191894f 100755 --- a/scripts/preprocessing/collect_code_metadata.py +++ b/scripts/preprocessing/collect_code_metadata.py @@ -8,6 +8,7 @@ import hydra import polars as pl +import polars.selectors as cs from loguru import logger from omegaconf import DictConfig, OmegaConf @@ -79,7 +80,10 @@ def main(cfg: DictConfig): logger.info("All map shards complete! Starting code metadata reduction computation.") reducer_fn = reducer_fntr(cfg.stage_cfg, cfg.get("code_modifier_columns", None)) - reduced = reducer_fn(*[pl.scan_parquet(fp, glob=False) for fp in all_out_fps]) + reduced = ( + reducer_fn(*[pl.scan_parquet(fp, glob=False) for fp in all_out_fps]) + .with_columns(cs.is_numeric().shrink_dtype().keep_name()) + ) write_lazyframe(reduced, output_dir / "code_metadata.parquet") logger.info(f"Finished reduction in {datetime.now() - start}")