Merge pull request #288 from IBM/runtime-reorg
Split code quality, malware and proglang select transforms into python and ray.
daw3rd authored Jun 21, 2024
2 parents 399d2c1 + 672e4a8 commit dda6e7c
Showing 77 changed files with 2,282 additions and 881 deletions.
6 changes: 3 additions & 3 deletions .make.defaults
@@ -310,10 +310,10 @@ __check_defined = \
$(MAKE) PIP_TARGET=data-prep-toolkit-ray .defaults.pip-uninstall; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_PYTHON_LIB_DIR) .defaults.install-src-venv; \
$(MAKE) PYTHON_PROJECT_DIR=$(DPK_RAY_LIB_DIR) .defaults.install-src-venv; \
if [ -d ../python ]; then \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
echo Installed source from Ray data processing library for `which $(PYTHON)`; \
if [ -d ../python ]; then \
$(MAKE) PYTHON_PROJECT_DIR=../python .defaults.install-src-venv; \
fi
echo Installed source from Ray data processing library for `which $(PYTHON)`

# Install local requirements last as it generally includes our lib source
.PHONY: .defaults.spark-lib-src-venv
2 changes: 2 additions & 0 deletions .make.versions
@@ -43,10 +43,12 @@ LANG_ID_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
TOKENIZATION_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
TOKENIZATION_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

MALWARE_PYTHON_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX)
MALWARE_RAY_VERSION=0.5.0$(RELEASE_VERSION_SUFFIX)

PROGLANG_SELECT_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

CODE_QUALITY_PYTHON_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
CODE_QUALITY_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)

INGEST_TO_PARQUET_RAY_VERSION=0.4.0$(RELEASE_VERSION_SUFFIX)
13 changes: 13 additions & 0 deletions transforms/code/code_quality/README.md
@@ -0,0 +1,13 @@
# Code Quality Transform
The Code Quality transform
captures code-specific metrics of input data.
Per the set of
[transform project conventions](../../README.md#transform-project-conventions)
the following runtimes are available:

* [python](python/README.md) - provides the base python-based transformation
implementation.
* [ray](ray/README.md) - enables running the base python transformation
in a Ray runtime.
* [kfp](kfp_ray/README.md) - enables running the ray docker image
in a kubernetes cluster using a generated `yaml` file.
1 change: 1 addition & 0 deletions transforms/code/code_quality/python/.dockerignore
@@ -0,0 +1 @@
venv/
37 changes: 37 additions & 0 deletions transforms/code/code_quality/python/.gitignore
@@ -0,0 +1,37 @@
test-data/output
output/*
/output/
data-processing-lib/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class


# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
htmlcov
.coverage
.cache
nosetests.xml
coverage.xml
42 changes: 42 additions & 0 deletions transforms/code/code_quality/python/Dockerfile
@@ -0,0 +1,42 @@
FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
WORKDIR /home/dpk

# Copy and install data processing libraries
# These are expected to be placed in the docker context before this is run (see the make image).
COPY --chown=dpk:root data-processing-lib-python/ data-processing-lib-python/
RUN cd data-processing-lib-python && pip install --no-cache-dir -e .

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

#COPY requirements.txt requirements.txt
#RUN pip install --no-cache-dir -r requirements.txt

# copy the transform entry point and the local sample
COPY ./src/code_quality_transform_python.py .
COPY ./src/code_quality_local.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk

# Put these at the end since they seem to upset the docker cache.
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
51 changes: 51 additions & 0 deletions transforms/code/code_quality/python/Makefile
@@ -0,0 +1,51 @@

# Define the root of the local git clone so that the common rules
# know where they are running from.
REPOROOT=../../../..
include $(REPOROOT)/transforms/.make.transforms

TRANSFORM_NAME=code_quality
# $(REPOROOT)/.make.versions file contains the versions
DOCKER_IMAGE_VERSION=${CODE_QUALITY_PYTHON_VERSION}

# Use default rule inherited from makefile.common
clean:: .transforms.clean

# Use default rule inherited from makefile.common
test:: .transforms.python-test

# Use default rule inherited from makefile.common
image:: .transforms.python-image

# Use default rule inherited from makefile.common
venv:: .transforms.python-venv

test-src:: .transforms.test-src

test-image:: .transforms.python-test-image

build:: build-dist image

publish:: publish-dist publish-image

publish-image:: .transforms.publish-image-python

setup:: .transforms.setup

# The distribution version is the same as the image version.
set-versions:
$(MAKE) TOML_VERSION=$(DOCKER_IMAGE_VERSION) .defaults.update-toml

build-dist:: set-versions .defaults.build-dist

publish-dist:: .defaults.publish-dist

run-cli-sample: .transforms.run-cli-python-sample

run-local-sample: .transforms.run-local-sample

run-local-python-sample: .transforms.run-local-python-sample

load-image:: .transforms.load-image
67 changes: 67 additions & 0 deletions transforms/code/code_quality/python/README.md
@@ -0,0 +1,67 @@
# Code Quality

Please see the set of
[transform project conventions](../../../README.md)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This module captures code-specific metrics of input data. The implementation is borrowed from the work done in the [CodeParrot](https://huggingface.co/blog/codeparrot) and [StarCoder](https://arxiv.org/abs/2305.06161) projects. In the current implementation, the module computes the following metrics and reports each metric in an individual column:

* line specific metrics, including the mean and max line length (an illustrative sketch follows below)
* character-to-token ratio - uses the input tokenizer to tokenize the input data and measures the ratio between the number of characters and the number of tokens
* identifies a high occurrence of the keywords "test " or "config" and tags such samples as config or test samples
* tags a sample as autogenerated if it contains keywords like `auto-generated`, `autogenerated` or `automatically generated`
* programming language specific identification, where:
    * if the input sample is in the `python` programming language and has no reference to constructs like `def` or `class`, it is flagged as `has_no_keywords`
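
The sketch below illustrates the line-level metrics; it is only an approximation of what the transform computes, not its actual implementation.

```python
# Illustrative sketch only; the transform's real implementation may differ.
def line_length_metrics(contents: str) -> dict:
    """Compute simple line-length statistics for one code sample."""
    lines = contents.splitlines() or [""]
    lengths = [len(line) for line in lines]
    return {
        "line_mean": sum(lengths) / len(lengths),  # mean line length
        "line_max": max(lengths),                  # longest line
        "total_num_lines": len(lengths),
    }

print(line_length_metrics("def add(a, b):\n    return a + b\n"))
```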

This module adds the following fields into the output file:
* `line_mean`
* `line_max`
* `total_num_lines`
* `avg_longest_lines`
* `alphanum_frac`
* `char_token_ratio`
* `autogenerated`
* `config_or_test`
* `has_no_keywords`
* `has_few_assignments`
* `is_xml`
* `is_html`

It uses a tokenizer to collect the metrics specific to the token ratio. The tokenizer is downloaded from [Hugging Face](https://huggingface.co/) if it is not found in the local cache. By default, it uses the [codeparrot/codeparrot](https://huggingface.co/codeparrot/codeparrot) tokenizer.
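
For illustration, a minimal sketch of how such a character-to-token ratio could be computed with an already-loaded Hugging Face tokenizer (this is not the module's verified code path):

```python
# Illustrative sketch only; `tokenizer` is any loaded Hugging Face tokenizer object.
def char_token_ratio(contents: str, tokenizer) -> float:
    """Ratio of characters to tokens for one code sample."""
    num_tokens = len(tokenizer(contents, truncation=False)["input_ids"])
    return len(contents) / max(num_tokens, 1)

# Example usage (assumes the `transformers` package is installed):
# from transformers import AutoTokenizer
# ratio = char_token_ratio(sample_text, AutoTokenizer.from_pretrained("codeparrot/codeparrot"))
```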

## Running

### Launcher Command Line Options

The following command line arguments are available in addition to
the options provided by the [ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md)
and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).

* "--contents_column_name" - input a column name which contains data to process. The default column name: `contents`
* "--language_column_name" - input a column name which contains programming language details. The default column name: `language`
* "--tokenizer" - input a tokenizer to convert the data into tokens. The default tokenizer is `codeparrot/codeparrot`
* "--hf_token" - input the Hugging Face auth token to download the tokenizer. This option is only required for the tokenizer's whose access is restricted in Hugging Face.

### Running the samples
To run the samples, use the following `make` targets

* `run-cli-sample` - runs src/code_quality_transform_python.py using command line args
* `run-local-sample` - runs src/code_quality_local_python.py

These targets will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the details of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then run
```shell
ls output
```
to see the results of the transform.
46 changes: 46 additions & 0 deletions transforms/code/code_quality/python/pyproject.toml
@@ -0,0 +1,46 @@
[project]
name = "dpk_code_quality_transform_python"
version = "0.4.0.dev6"
requires-python = ">=3.10"
description = "Code Quality Python Transform"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
{ name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" },
]
dependencies = [
"data-prep-toolkit==0.2.0.dev6",
"bs4==0.0.2",
"transformers==4.38.2",
]

[build-system]
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
build-backend = "setuptools.build_meta"

[project.optional-dependencies]
dev = [
"twine",
"pytest>=7.3.2",
"pytest-dotenv>=0.5.2",
"pytest-env>=1.0.0",
"pre-commit>=3.3.2",
"pytest-cov>=4.1.0",
"pytest-mock>=3.10.0",
"moto==5.0.5",
"markupsafe==2.0.1",
]

[options]
package_dir = ["src","test"]

[options.packages.find]
where = ["src/"]

[tool.pytest.ini_options]
# Currently we use low coverage since we have to run tests separately (see makefile)
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = ["unit: unit tests", "integration: integration tests"]

[tool.coverage.run]
include = ["src/*"]
@@ -12,7 +12,7 @@

import os

-from code_quality_transform_ray import CodeQualityTransform
+from code_quality_transform import CodeQualityTransform
from data_processing.data_access import DataAccessLocal


@@ -14,7 +14,7 @@
import sys
from pathlib import Path

-from code_quality_transform_ray import CodeQualityTransformConfiguration
+from code_quality_transform import CodeQualityTransformConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils

