[DO NOT MERGE] TESTING GITHUB ACTIONS PIPELINE FOR PYTHON UPDATE #4044

Open: wants to merge 67 commits into base: master

Changes from all commits (67 commits)
c27d123
Removed torchtext from NGramTokenizer
Oct 25, 2024
8e90e70
Refactored SentencePieceTokenizer
Oct 26, 2024
92a3ec0
removed torchtext
Oct 28, 2024
01f308e
Rewrote tests
Oct 29, 2024
6011fb1
created __about__ file for hatch versioning and updated pyproject
ethanreidel Oct 30, 2024
fd6d146
moved to hatch build system in .toml file
ethanreidel Nov 1, 2024
7e2574f
added dockerfile for hatch
ethanreidel Nov 8, 2024
8e9d32d
Altered dependencies so it works for hatch env create
Nov 17, 2024
9c79935
Updated pytest workflows python versions
Nov 17, 2024
0a06171
Fixed minimal test python version
Nov 17, 2024
4256075
Fixed small error with versioning naming
Nov 17, 2024
a09ef95
Adding ludwig script to pyproject.toml
Nov 17, 2024
687c8ea
added tifffile to dependencies
Nov 17, 2024
123b678
Bumped Python Version for Minimal Test
Nov 17, 2024
da47379
fixed tifffile dep. and removed tests from pytest.yml
Nov 17, 2024
68fb836
Add combinatorial tests to pytest.yaml
Nov 17, 2024
6a896f9
Refined pyproject toml
Nov 19, 2024
8d0f0c4
Removed importlib from dependencies
Nov 19, 2024
5e48bef
Merge remote-tracking branch 'origin/remove_torchtext' into hatch_dev_mh
Nov 20, 2024
a926588
bump torch version to 2.4.1
Nov 20, 2024
5b288f2
fall back to eager for torch dynamo to prevent error
Nov 20, 2024
26fe135
added flake8 ignore line length error
Nov 20, 2024
551eb40
added pytest suite to jobs
Nov 20, 2024
e4161c6
Refactored Matrix Tests
Nov 20, 2024
01533ef
Remove Neuropod from tests.
Nov 20, 2024
b2a1454
Further removed torchtext
Nov 25, 2024
80dc452
try to fix openblas issue
Nov 25, 2024
f45d046
test pythran 0.9
Nov 26, 2024
802748c
fix version code
Nov 26, 2024
68566b8
second fix version code
Nov 26, 2024
a108bb5
pythran via pip
Nov 26, 2024
3607bd2
added prefer binary
Nov 26, 2024
c1af754
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 1, 2024
5279f0c
updated wandb version
ethanreidel Dec 1, 2024
d33982a
Merge branch 'test_python_update' of github.com:ethanreidel/ludwig in…
ethanreidel Dec 2, 2024
500b5fc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
c1eb205
updated matplotlib
ethanreidel Dec 2, 2024
6f90a5f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
a3704a1
fixed invalid error in toml
ethanreidel Dec 2, 2024
294a760
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 2, 2024
0ab5858
testing matplotlib version
ethanreidel Dec 2, 2024
3dea80f
bumped scipy version and added matplotlib to default dependencies
ethanreidel Dec 8, 2024
cbc3d80
commented out combinatorial tests
ethanreidel Jan 14, 2025
56d5302
added GPy step to pytest jobs
ethanreidel Jan 16, 2025
5148870
fixed gpy typo
ethanreidel Jan 16, 2025
17f7846
added download longintrepr file
ethanreidel Jan 16, 2025
7f3975d
added sudo privs to longint file
ethanreidel Jan 16, 2025
1081593
added cython 0.29.35 trying to fix GPy error
ethanreidel Jan 16, 2025
a596ee6
testing install dependency line
ethanreidel Jan 21, 2025
f37c512
fix yml issue in actions
ethanreidel Jan 21, 2025
6d2195f
debugging GPy error
ethanreidel Jan 21, 2025
10b85c1
added logging for successful installs
ethanreidel Jan 22, 2025
6715601
added logging for more GPy debug tests
ethanreidel Jan 22, 2025
67c593c
added more tests
ethanreidel Jan 23, 2025
db2204f
testing sqlalchemy
ethanreidel Jan 23, 2025
b120232
zoopt typo
ethanreidel Jan 23, 2025
6a78456
added distributed, explain, extra tests
ethanreidel Jan 23, 2025
a86c1e7
added final dependencies
ethanreidel Jan 23, 2025
87e0a64
added test install all
ethanreidel Jan 23, 2025
fbcb7e1
testing install all
ethanreidel Jan 23, 2025
10c8537
fix yaml issue
ethanreidel Jan 23, 2025
db68442
testing each group of dependencies
ethanreidel Jan 23, 2025
6c45530
fixed yaml issue
ethanreidel Jan 24, 2025
7025eb9
fixed quote issue
ethanreidel Jan 24, 2025
1363dc2
testing base dependencies
ethanreidel Jan 24, 2025
c3f0823
testing GPy issue
ethanreidel Jan 24, 2025
d148e4a
script to check dependencies
ethanreidel Jan 24, 2025
617 changes: 400 additions & 217 deletions .github/workflows/pytest.yml

Large diffs are not rendered by default.
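Since the 617-line workflow diff is collapsed above, here is a hedged sketch of what the commit messages (Python version matrix updates, per-group dependency installs, pytest suite jobs) suggest the updated job could look like. Every job name, action version, and Python version below is an assumption for orientation only, not the PR's actual pytest.yml:

# Hypothetical sketch only -- not the actual 617-line pytest.yml change.
name: pytest
on: [push, pull_request]
jobs:
  pytest:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10", "3.11", "3.12"]  # assumed target versions
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install -U pip
          pip install '.[test]'
      - name: Run test suite
        run: pytest -v tests/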

2 changes: 1 addition & 1 deletion .github/workflows/pytest_slow.yml
@@ -50,7 +50,7 @@ jobs:
python --version
pip --version
python -m pip install -U pip
-pip install torch==2.1.0 torchtext torchvision torchaudio
+pip install torch==2.1.0 torchvision torchaudio
pip install ray==2.3.1
pip install '.[test]'

2 changes: 2 additions & 0 deletions .gitignore
@@ -140,3 +140,5 @@ examples/*/visualizations/

# benchmarking configs
ludwig/benchmarking/configs/
+pytest.xml
+ludwig.code-workspace
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -34,7 +34,7 @@ repos:
- id: pyupgrade
args: [--py36-plus]
- repo: https://github.com/PyCQA/docformatter
-rev: v1.5.1
+rev: 06907d0
hooks:
- id: docformatter
args: [--in-place, --wrap-summaries=115, --wrap-descriptions=120]
2 changes: 1 addition & 1 deletion docker/ludwig-ray-gpu/Dockerfile
@@ -50,7 +50,7 @@ RUN pip install -U pip

WORKDIR /ludwig

-RUN pip install --no-cache-dir torch==2.1.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN pip install --no-cache-dir torch==2.1.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

COPY . .
RUN pip install --no-cache-dir '.[full]' --extra-index-url https://download.pytorch.org/whl/cu118
2 changes: 1 addition & 1 deletion docker/ludwig-ray/Dockerfile
@@ -36,7 +36,7 @@ RUN pip install -U pip

WORKDIR /ludwig

-RUN pip install --no-cache-dir torch==2.1.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir torch==2.1.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

COPY . .
RUN pip install --no-cache-dir '.[full]' --extra-index-url https://download.pytorch.org/whl/cpu
2 changes: 1 addition & 1 deletion docker/ludwig/Dockerfile
@@ -24,7 +24,7 @@ RUN pip install -U pip

WORKDIR /ludwig

-RUN pip install --no-cache-dir torch==2.0.0 torchtext torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir torch==2.0.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

COPY . .
RUN pip install --no-cache-dir '.[full]'
17 changes: 17 additions & 0 deletions docker/ludwig_hatch/Dockerfile
@@ -0,0 +1,17 @@
FROM python:3.12

ENV PATH="/root/.local/bin:$PATH"
RUN apt-get -y update
RUN apt-get -y install pipx
RUN apt-get -y install git libsndfile1 build-essential g++ cmake ffmpeg sox libsox-dev
RUN pipx ensurepath --force
RUN pipx install hatch
RUN python3 -m pip install --upgrade pipx
WORKDIR /ludwig
#COPY /ludwig/ .
COPY . .

RUN hatch env create
RUN hatch build

ENTRYPOINT ["ludwig"]
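A plausible way to build and exercise this new image locally (the tag name is illustrative, not part of the PR); because the ENTRYPOINT is ludwig, any arguments go straight to the CLI:

# Assumed usage; image tag is arbitrary.
docker build -f docker/ludwig_hatch/Dockerfile -t ludwig-hatch .
docker run --rm ludwig-hatch --help   # runs `ludwig --help` via the ENTRYPOINT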
1 change: 1 addition & 0 deletions ludwig/__about__.py
@@ -0,0 +1 @@
__version__ = "1.13.0"
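This new __about__.py becomes the single source of truth for the package version under the hatch build backend. The matching pyproject.toml diff is not rendered in this view; a minimal sketch of the hatchling wiring that would read this file (assumed from the "created __about__ file for hatch versioning" commit, not copied from the PR) is:

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.version]
# hatchling extracts __version__ from this file at build time
path = "ludwig/__about__.py"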
15 changes: 6 additions & 9 deletions ludwig/api.py
@@ -2015,9 +2015,9 @@ def to_torchscript(
# Inputs

:param model_only (bool, optional): If True, only the ECD model will be converted to Torchscript. Else,
-preprocessing and postprocessing steps will also be converted to Torchscript.
-:param device (TorchDevice, optional): If None, the model will be converted to Torchscript on the same device to
-ensure maximum model parity.
+preprocessing and postprocessing steps will also be converted to Torchscript. :param device (TorchDevice,
+optional): If None, the model will be converted to Torchscript on the same device to ensure maximum model
+parity.

# Returns

@@ -2086,11 +2086,8 @@ def create_model(config_obj: Union[ModelConfig, dict], random_seed: int = defaul

# Inputs
:param config_obj: (Union[Config, dict]) Ludwig config object
-:param random_seed: (int, default: ludwig default random seed) Random
-seed used for weights initialization,
-splits and any other random function.
-
-# Return
+:param random_seed: (int, default: ludwig default random seed) Random seed used for weights initialization,
+splits and any other random function. # Return
:return: (ludwig.models.BaseModel) Instance of the Ludwig model object.
"""
if isinstance(config_obj, dict):
@@ -2136,7 +2133,7 @@ def is_merge_and_unload_set(self) -> bool:

# Return

-:return (bool): whether merge_and_unload should be done.
+:return (bool): whether merge_and_unload should be done.
"""
# TODO: In the future, it may be possible to move up the model type check into the BaseModel class.
return self.config_obj.model_type == MODEL_LLM and self.model.is_merge_and_unload_set()
31 changes: 10 additions & 21 deletions ludwig/automl/base_config.py
@@ -79,9 +79,8 @@ class DatasetInfo:
def allocate_experiment_resources(resources: Resources) -> dict:
"""Allocates ray trial resources based on available resources.

-# Inputs
-:param resources (dict) specifies all available GPUs, CPUs and associated
-metadata of the machines (i.e. memory)
+# Inputs :param resources (dict) specifies all available GPUs, CPUs and associated metadata of the machines
+(i.e. memory)

# Return
:return: (dict) gpu and cpu resources per trial
@@ -260,9 +259,7 @@ def get_dataset_info(df: Union[pd.DataFrame, dd.core.DataFrame]) -> DatasetInfo:
inference.

# Inputs
-:param df: (Union[pd.DataFrame, dd.core.DataFrame]) Pandas or Dask dataframe.
-
-# Return
+:param df: (Union[pd.DataFrame, dd.core.DataFrame]) Pandas or Dask dataframe. # Return
:return: (DatasetInfo) Structure containing list of FieldInfo objects.
"""
source = wrap_data_source(df)
@@ -297,9 +294,7 @@ def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
inference.

# Inputs
-:param source: (DataSource) A wrapper around a data source, which may represent a pandas or Dask dataframe.
-
-# Return
+:param source: (DataSource) A wrapper around a data source, which may represent a pandas or Dask dataframe. # Return
:return: (DatasetInfo) Structure containing list of FieldInfo objects.
"""
row_count = len(source)
@@ -355,10 +350,8 @@ def get_features_config(

# Inputs
:param fields: (List[FieldInfo]) FieldInfo objects for all fields in dataset
-:param row_count: (int) total number of entries in original dataset
-:param target_name (str, List[str]) name of target feature
-
-# Return
+:param row_count: (int) total number of entries in original dataset :param target_name (str, List[str]) name of
+target feature # Return
:return: (dict) section of auto_train config for input_features and output_features
"""
targets = convert_targets(target_name)
@@ -379,10 +372,8 @@ def get_config_from_metadata(metadata: List[FieldMetadata], targets: Set[str] =
"""Builds input/output feature sections of auto-train config using field metadata.

# Inputs
-:param metadata: (List[FieldMetadata]) field descriptions
-:param targets (Set[str]) names of target features
-
-# Return
+:param metadata: (List[FieldMetadata]) field descriptions :param targets (Set[str]) names of target features #
+Return
:return: (dict) section of auto_train config for input_features and output_features
"""
config = {
@@ -405,10 +396,8 @@ def get_field_metadata(fields: List[FieldInfo], row_count: int, targets: Set[str

# Inputs
:param fields: (List[FieldInfo]) FieldInfo objects for all fields in dataset
-:param row_count: (int) total number of entries in original dataset
-:param targets (Set[str]) names of target features
-
-# Return
+:param row_count: (int) total number of entries in original dataset :param targets (Set[str]) names of target
+features # Return
:return: (List[FieldMetadata]) list of objects containing metadata for each field
"""

9 changes: 5 additions & 4 deletions ludwig/backend/_ray210_compat.py
@@ -19,8 +19,8 @@
class TunerRay210(Tuner):
"""HACK(geoffrey): This is a temporary fix to support Ray 2.1.0.

-Specifically, this Tuner ensures that TunerInternalRay210 is called by the class.
-For more details, see TunerInternalRay210.
+Specifically, this Tuner ensures that TunerInternalRay210 is called by the class. For more details, see
+TunerInternalRay210.
"""

def __init__(
@@ -120,8 +120,9 @@ def restore(
class TunerInternalRay210(TunerInternal):
"""HACK(geoffrey): This is a temporary fix to support Ray 2.1.0.

-This TunerInternal ensures that a division by zero is avoided when running zero-CPU hyperopt trials.
-This is fixed in ray>=2.2 (but not ray<=2.1) here: https://github.com/ray-project/ray/pull/30598
+This TunerInternal ensures that a division by zero is avoided when running zero-CPU hyperopt trials. This is fixed
+in ray>=2.2 (but not ray<=2.1) here:
+https://github.com/ray-project/ray/pull/30598
"""

def _expected_utilization(self, cpus_per_trial, cpus_total):
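The body of _expected_utilization is truncated above. As a loose sketch of the guard the class docstring implies — a guess at its shape, assuming the upstream formula divides by cpus_total; this is not the actual Ray or Ludwig code:

def _expected_utilization(cpus_per_trial: float, cpus_total: float) -> float:
    # Hypothetical: short-circuit zero-CPU trials instead of dividing by zero.
    if cpus_total == 0:
        return 0.0
    return cpus_per_trial / cpus_total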
4 changes: 2 additions & 2 deletions ludwig/backend/datasource.py
@@ -88,8 +88,8 @@ def _open_input_source(

The default implementation opens the source path as a sequential input stream.

-Implementations that do not support streaming reads (e.g. that require random
-access) should override this method.
+Implementations that do not support streaming reads (e.g. that require random access) should override this
+method.
"""
if path is None or is_http(path):
return contextlib.nullcontext()
2 changes: 1 addition & 1 deletion ludwig/backend/deepspeed.py
@@ -17,7 +17,7 @@ def __init__(
fp16: Optional[Dict[str, Any]] = None,
bf16: Optional[Dict[str, Any]] = None,
compression_training: Optional[Dict[str, Any]] = None,
-**kwargs
+**kwargs,
):
super().__init__(**kwargs)
self.zero_optimization = zero_optimization
3 changes: 2 additions & 1 deletion ludwig/benchmarking/summary_dataclasses.py
@@ -8,7 +8,8 @@
import ludwig.modules.metric_modules # noqa: F401
from ludwig.benchmarking.utils import format_memory, format_time
from ludwig.globals import MODEL_FILE_NAME, MODEL_HYPERPARAMETERS_FILE_NAME
-from ludwig.modules.metric_registry import get_metric_classes, metric_feature_type_registry # noqa: F401
+from ludwig.modules.metric_registry import get_metric_classes # noqa: F401
+from ludwig.modules.metric_registry import metric_feature_type_registry
from ludwig.types import ModelConfigDict
from ludwig.utils.data_utils import load_json

3 changes: 1 addition & 2 deletions ludwig/callbacks.py
@@ -48,7 +48,7 @@ def on_preprocess_end(self, training_set, validation_set, test_set, training_set
:param test_set: The test set.
:type test_set: ludwig.dataset.base.Dataset
:param training_set_metadata: Values inferred from the training set, including preprocessing settings,
-vocabularies, feature statistics, etc. Same as training_set_metadata.json.
+vocabularies, feature statistics, etc. Same as training_set_metadata.json.
"""

pass
@@ -374,7 +374,6 @@ def prepare_ray_tune(self, train_fn: Callable, tune_config: Dict[str, Any], tune
:param train_fn: The function which runs the experiment trial.
:param tune_config: The ray tune configuration dictionary.
:param tune_callbacks: List of callbacks (not used yet).
-
:returns: Tuple[Callable, Dict] The train_fn and tune_config, which will be passed to ray tune.
"""
return train_fn, tune_config
3 changes: 2 additions & 1 deletion ludwig/config_validation/checks.py
@@ -358,7 +358,8 @@ def check_hyperopt_parameter_dicts(config: "ModelConfig") -> None: # noqa: F821
if config.hyperopt is None:
return

-from ludwig.schema.hyperopt.utils import get_parameter_cls, parameter_config_registry # noqa: F401
+from ludwig.schema.hyperopt.utils import get_parameter_cls # noqa: F401
+from ludwig.schema.hyperopt.utils import parameter_config_registry

for parameter, space in config.hyperopt.parameters.items():
# skip nested hyperopt parameters
6 changes: 4 additions & 2 deletions ludwig/config_validation/validation.py
@@ -11,9 +11,11 @@

# TODO(travis): figure out why we need these imports to avoid circular import error
from ludwig.schema.combiners.utils import get_combiner_jsonschema # noqa
-from ludwig.schema.features.utils import get_input_feature_jsonschema, get_output_feature_jsonschema # noqa
+from ludwig.schema.features.utils import get_input_feature_jsonschema # noqa
+from ludwig.schema.features.utils import get_output_feature_jsonschema
from ludwig.schema.hyperopt import get_hyperopt_jsonschema # noqa
-from ludwig.schema.trainer import get_model_type_jsonschema, get_trainer_jsonschema # noqa
+from ludwig.schema.trainer import get_model_type_jsonschema # noqa
+from ludwig.schema.trainer import get_trainer_jsonschema
from ludwig.schema.utils import unload_jsonschema_from_marshmallow_class

VALIDATION_LOCK = Lock()
1 change: 0 additions & 1 deletion ludwig/contrib.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-
"""Module for handling contributed support."""

import argparse
1 change: 0 additions & 1 deletion ludwig/contribs/__init__.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
-
"""All contrib classes must implement the `ludwig.callbacks.Callback` interface.

If you don't want to handle the call, either provide an empty method with `pass`, or just don't implement the method.
6 changes: 3 additions & 3 deletions ludwig/data/preprocessing.py
@@ -2086,12 +2086,12 @@ def _preprocess_file_for_training(

:param features: list of all features (input + output)
:param dataset: path to the data
-:param training_set: training data
+:param training_set: training data
:param validation_set: validation data
:param test_set: test data
:param training_set_metadata: train set metadata
-:param skip_save_processed_input: if False, the pre-processed data is saved
-as .hdf5 files in the same location as the csv files with the same names.
+:param skip_save_processed_input: if False, the pre-processed data is saved as .hdf5 files in the same location as
+the csv files with the same names.
:param preprocessing_params: preprocessing parameters
:param random_seed: random seed
:return: training, test, validation datasets, training metadata
4 changes: 2 additions & 2 deletions ludwig/data/sampler.py
@@ -64,8 +64,8 @@ def __len__(self):
def set_epoch(self, epoch):
"""Sets the epoch for this sampler.

-When `shuffle=True`, this ensures all replicas use a different random ordering
-for each epoch. Otherwise, the next iteration of this sampler will yield the same ordering.
+When `shuffle=True`, this ensures all replicas use a different random ordering for each epoch. Otherwise, the
+next iteration of this sampler will yield the same ordering.

:param epoch: (int) epoch number
"""
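For context, a self-contained sketch of the epoch-seeded shuffling pattern this docstring describes, mirroring torch's DistributedSampler convention; the class and its names are illustrative, not Ludwig's actual sampler:

import torch

class EpochShuffleSampler:
    """Seeding the RNG with the epoch number gives every replica the same
    fresh ordering each epoch; skipping set_epoch repeats the ordering."""

    def __init__(self, data_len: int, shuffle: bool = True):
        self.data_len = data_len
        self.shuffle = shuffle
        self.epoch = 0

    def set_epoch(self, epoch: int) -> None:
        self.epoch = epoch

    def __iter__(self):
        if not self.shuffle:
            return iter(range(self.data_len))
        g = torch.Generator()
        g.manual_seed(self.epoch)  # same seed on every replica for this epoch
        return iter(torch.randperm(self.data_len, generator=g).tolist())

    def __len__(self) -> int:
        return self.data_len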
8 changes: 3 additions & 5 deletions ludwig/datasets/__init__.py
@@ -242,7 +242,7 @@ def get_datasets_output_features(
:param include_competitions: (bool) whether to include the output features from kaggle competition datasets
:param include_data_modalities: (bool) whether to include the data modalities associated with the prediction task
:return: (dict) dictionary with the output features for each dataset or a dictionary with the output features for
-the specified dataset
+the specified dataset
"""
ordered_configs = OrderedDict(sorted(_get_dataset_configs().items()))
competition_datasets = []
@@ -321,10 +321,8 @@ def _get_hf_dataset_and_subsample(dataset_name: str) -> Tuple[str, Optional[str]

The dataset name should follow the format "{HF_PREFIX}{hf_id}--{hf_subsample}"

-Examples (Dataset Name --> HF ID; HF subsample):
-"hf://wikisql" --> "wikisql"; None
-"hf://ColumbiaNLP/FLUTE" --> "ColumbiaNLP/FLUTE"; None
-"hf://mstz/adult--income" --> "mstz/adult"; "income"
+Examples (Dataset Name --> HF ID; HF subsample): "hf://wikisql" --> "wikisql"; None "hf://ColumbiaNLP/FLUTE" -->
+"ColumbiaNLP/FLUTE"; None "hf://mstz/adult--income" --> "mstz/adult"; "income"
"""
dataset_name = dataset_name[len(HF_PREFIX) :]
dataset_name = dataset_name.split("--")
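A standalone sketch of the name parsing those examples describe; the helper name and return shape are illustrative, and the real logic lives in the truncated function above:

from typing import Optional, Tuple

HF_PREFIX = "hf://"

def parse_hf_dataset_name(dataset_name: str) -> Tuple[str, Optional[str]]:
    # "hf://mstz/adult--income" -> ("mstz/adult", "income")
    name = dataset_name[len(HF_PREFIX):]
    hf_id, sep, subsample = name.partition("--")
    return hf_id, subsample if sep else None

assert parse_hf_dataset_name("hf://mstz/adult--income") == ("mstz/adult", "income")
assert parse_hf_dataset_name("hf://wikisql") == ("wikisql", None)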
17 changes: 5 additions & 12 deletions ludwig/datasets/loaders/mnist.py
@@ -58,11 +58,8 @@ def load_unprocessed_dataframe(self, file_paths: List[str]) -> pd.DataFrame:
def read_source_dataset(self, dataset="training", path="."):
"""Create a directory for training and test and extract all the images and labels to this destination.

-:args:
-dataset (str) : the label for the dataset
-path (str): the raw dataset path
-:returns:
-A tuple of the label for the image, the file array, the size and rows and columns for the image
+:args: dataset (str) : the label for the dataset path (str): the raw dataset path
+:returns: A tuple of the label for the image, the file array, the size and rows and columns for the image
"""
if dataset == "training":
fname_img = os.path.join(path, "train-images-idx3-ubyte")
@@ -87,13 +84,9 @@ def write_output_dataset(self, labels, images, output_dir):
def write_output_dataset(self, labels, images, output_dir):
"""Create output directories where we write out the images.

-:args:
-labels (str) : the labels for the image
-data (np.array) : the binary array corresponding to the image
-output_dir (str) : the output directory that we need to write to
-path (str): the raw dataset path
-:returns:
-A tuple of the label for the image, the file array, the size and rows and columns for the image
+:args: labels (str) : the labels for the image data (np.array) : the binary array corresponding to the
+image output_dir (str) : the output directory that we need to write to path (str): the raw dataset path
+:returns: A tuple of the label for the image, the file array, the size and rows and columns for the image
"""
# create child image output directories
output_dirs = [os.path.join(output_dir, str(i)) for i in range(NUM_LABELS)]
6 changes: 4 additions & 2 deletions ludwig/datasets/loaders/split_loaders.py
@@ -21,10 +21,12 @@

class RandomSplitLoader(DatasetLoader):
"""Adds a random split column to the dataset, with fixed proportions of:
-train: 70%
+
+train: 70%
validation: 10%
test: 20%
."""
.
"""

def transform_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame:
df = super().transform_dataframe(dataframe)
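A minimal sketch of the fixed-proportion split column that docstring describes, using Ludwig's 0/1/2 encoding for train/validation/test; the function name and seed handling are illustrative:

import numpy as np
import pandas as pd

def add_random_split(df: pd.DataFrame, seed: int = 42) -> pd.DataFrame:
    # 0 = train (70%), 1 = validation (10%), 2 = test (20%)
    rng = np.random.default_rng(seed)
    out = df.copy()
    out["split"] = rng.choice([0, 1, 2], size=len(out), p=[0.7, 0.1, 0.2])
    return out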
2 changes: 1 addition & 1 deletion ludwig/decoders/llm_decoders.py
@@ -1,3 +1,4 @@
+# flake8: noqa: E501
import logging
import re
from typing import Any, Dict, List, Union
@@ -91,7 +92,6 @@ def __init__(
# Transformer Tokenizers
self.tokenizer_vocab_size = self.tokenizer.tokenizer.vocab_size
else:
-# TorchText Tokenizers
self.tokenizer_vocab_size = len(self.tokenizer.vocab)

# Maximum number of new tokens that will be generated