diff --git a/pkg/fe/builder/overlay/source.go b/pkg/fe/builder/overlay/source.go index 3605bde5..083b3107 100644 --- a/pkg/fe/builder/overlay/source.go +++ b/pkg/fe/builder/overlay/source.go @@ -5,22 +5,21 @@ import ( "io/fs" "os" "path/filepath" + "regexp" + "slices" + "strings" "gopkg.in/yaml.v3" "lunchpail.io/pkg/ir/hlir" ) +// Formulate an HLIR for the source in the given `sourcePath` and write it out to the `templatePath` func copySourceIntoTemplate(appname, sourcePath, templatePath string, opts Options) (appVersion string, err error) { if opts.Verbose() { fmt.Fprintln(os.Stderr, "Copying application source into", appdir(templatePath)) } - appVersion, err = addHLIRFromSource(appname, sourcePath, templatePath, opts) - return -} - -func addHLIRFromSource(appname, sourcePath, templatePath string, opts Options) (string, error) { appVersion, app, err := applicationFromSource(appname, sourcePath, templatePath, opts) if err != nil { return "", err @@ -38,22 +37,47 @@ func addHLIRFromSource(appname, sourcePath, templatePath string, opts Options) ( return appVersion, nil } +// Formulate an HLIR for the source in the given `sourcePath` func applicationFromSource(appname, sourcePath, templatePath string, opts Options) (appVersion string, app hlir.Application, err error) { app = hlir.NewWorkerApplication(appname) spec := &app.Spec - filepath.WalkDir(sourcePath, func(path string, d fs.DirEntry, err error) error { + maybeImage := "" + maybeCommand := "" + + // While walking the directory structure, these are the noteworthy subdirectories + srcPrefix := filepath.Join(sourcePath, "src") + + err = filepath.WalkDir(sourcePath, func(path string, d fs.DirEntry, err error) error { switch { case d.IsDir(): - // skip directories - case filepath.Ext(path) == ".html" || filepath.Ext(path) == ".gz" || filepath.Ext(path) == ".zip" || filepath.Ext(path) == ".parquet": - // skip data files + // skip directories, except to remember which "mode" we are in + case filepath.Ext(path) == ".pdf" || filepath.Ext(path) == ".html" || filepath.Ext(path) == ".gz" || filepath.Ext(path) == ".zip" || filepath.Ext(path) == ".parquet": + // skip data files; TODO add support for .ignore + case path[len(path)-1] == '~': + // skip emacs temporary files default: b, err := os.ReadFile(path) if err != nil { return err } + if strings.HasPrefix(path, srcPrefix) { + // Handle src/ artifacts + spec.Code = append(spec.Code, hlir.Code{Name: d.Name(), Source: string(b)}) + + switch d.Name() { + case "main.sh": + maybeCommand = "./main.sh" + maybeImage = "docker.io/alpine:3" + case "main.py": + maybeCommand = "python3 main.py" + maybeImage = "docker.io/python:3.12" + } + return nil + } + + // Handle non-src artifacts switch d.Name() { case "version", "version.txt": if appVersion, err = handleVersionFile(path); err != nil { @@ -61,22 +85,44 @@ func applicationFromSource(appname, sourcePath, templatePath string, opts Option } case "requirements.txt": spec.Needs = append(spec.Needs, hlir.Needs{Name: "python", Version: "latest", Requirements: string(b)}) + case "memory", "memory.txt": + spec.MinMemory = string(b) + case "image": + spec.Image = string(b) + case "command": + spec.Command = string(b) + case "env.yaml": + err := yaml.Unmarshal(b, &spec.Env) + if err != nil { + return fmt.Errorf("Error parsing env.yaml: %v", err) + } default: - spec.Code = append(spec.Code, hlir.Code{Name: d.Name(), Source: string(b)}) - } - - switch d.Name() { - case "main.sh": - spec.Command = "./main.sh" - spec.Image = "docker.io/alpine:3" - case "main.py": - spec.Command = "python3 main.py" - spec.Image = "docker.io/python:3.12" + if opts.Verbose() { + fmt.Fprintln(os.Stderr, "Skipping application artifact", strings.Replace(path, sourcePath, "", 1)) + } } } return nil }) + if spec.Command == "" && maybeCommand != "" { + spec.Command = maybeCommand + } + if spec.Image == "" && maybeImage != "" { + spec.Image = maybeImage + } + + pyNeedsIdx := slices.IndexFunc(spec.Needs, func(n hlir.Needs) bool { return n.Name == "python" && n.Version == "latest" }) + if pyNeedsIdx >= 0 && strings.HasPrefix(spec.Command, "python3") { + version := regexp.MustCompile("\\d.\\d+").FindString(spec.Command) + if version != "" { + if opts.Verbose() { + fmt.Fprintln(os.Stderr, "Using Python version", version) + } + spec.Needs[pyNeedsIdx].Version = version + } + } + return } diff --git a/pkg/runtime/needs/install_darwin.go b/pkg/runtime/needs/install_darwin.go index 82b642ad..a5af3b27 100644 --- a/pkg/runtime/needs/install_darwin.go +++ b/pkg/runtime/needs/install_darwin.go @@ -35,7 +35,11 @@ func installPython(ctx context.Context, version string, verbose bool) (string, e return "", err } - return "", brewInstall(ctx, "python3", version, verbose) //Todo: versions other than latest + python := "python@" + version + if version == "" || version == "latest" { + python = "python3" + } + return "", brewInstall(ctx, python, version, verbose) //Todo: versions other than latest } func brewInstall(ctx context.Context, pkg string, version string, verbose bool) error { diff --git a/pkg/runtime/needs/install_linux.go b/pkg/runtime/needs/install_linux.go index bebd7ce5..6f2987ca 100644 --- a/pkg/runtime/needs/install_linux.go +++ b/pkg/runtime/needs/install_linux.go @@ -2,9 +2,13 @@ package needs import ( "context" + "io" + "net/http" "os" "os/exec" "path/filepath" + + "github.com/hairyhenderson/go-which" ) func bindir() (string, error) { @@ -45,47 +49,74 @@ func installMinio(ctx context.Context, version string, verbose bool) (string, er } func installPython(ctx context.Context, version string, verbose bool) (string, error) { - /* - if verbose { - fmt.Fprintf(os.Stdout, "Installing %s release of python \n", version) + if version == "" || version == "latest" { + version = "3" + } + + if which.Which("python"+version) == "" { + if verbose { + fmt.Fprintf(os.Stdout, "Installing python%s\n", version) + } + + dir, err := bindir() + if err != nil { + return err + } + + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + + downloadVersion := "3.13.0" + switch version { + case "3.12": + downloadVersion = "3.12.7" + case "3.11": + downloadVersion = "3.11.10" + case "3.10": + downloadVersion = "3.10.15" + case "3.9": + downloadVersion = "3.9.20" + case "3.8": + downloadVersion = "3.8.20" + case "3.7": + downloadVersion = "3.7.17" + } + + tarball := fmt.Sprintf("Python-%s.tgz", downloadVersion) + resp, err := http.Get(fmt.Sprintf("https://www.python.org/ftp/python/%s/%s", downloadVersion, tarball)) + if err != nil { + return "", err } + defer resp.Body.Close() - dir, err := bindir() - if err != nil { - return err - } - - if err := os.MkdirAll(dir, 0755); err != nil { - return err - } - - //Todo: versions other than latest - cmd := exec.Command("wget", "https://www.python.org/ftp/python/3.12.7/Python-3.12.7.tgz") - cmd.Dir = dir - if verbose { - cmd.Stdout = os.Stdout - } - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return err - } - - cmd = exec.Command("tar", "xf", "Python-3.12.7.tgz") - cmd.Dir = dir - if verbose { - cmd.Stdout = os.Stdout - } - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - return err - } - - if err := setenv(dir); err != nil { //setting $PATH - return err - } - - os.Chmod(filepath.Join(dir, "python"), 0755) - */ + out, err := os.Create(tarball) + if err != nil { + return "", err + } + + if _, err := io.Copy(out, resp.Body); err != nil { + return "", err + } + out.Close() + defer os.Remove(tarball) + + cmd = exec.Command("tar", "-C", dir, "xf", tarball) + if verbose { + cmd.Stdout = os.Stdout + } + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + return "", err + } + + if err := setenv(dir); err != nil { //setting $PATH + return "", err + } + + os.Chmod(filepath.Join(dir, "python"), 0755) + os.Chmod(filepath.Join(dir, "python"+version), 0755) + } return "", nil } diff --git a/pkg/runtime/needs/install_requirements.go b/pkg/runtime/needs/install_requirements.go index a5f22610..ea2ab6d3 100644 --- a/pkg/runtime/needs/install_requirements.go +++ b/pkg/runtime/needs/install_requirements.go @@ -11,7 +11,7 @@ import ( "syscall" ) -func requirementsInstall(ctx context.Context, requirements string, verbose bool) (string, error) { +func requirementsInstall(ctx context.Context, version, requirements string, verbose bool) (string, error) { var cmd *exec.Cmd var verboseFlag string var reqmtsByte []byte @@ -84,10 +84,14 @@ func requirementsInstall(ctx context.Context, requirements string, verbose bool) quiet = "" } + if version == "" || version == "latest" { + version = "3" + } + cmds := fmt.Sprintf(`python3 -m venv %s source %s/bin/activate -if ! which pip3; then python3 -m pip install pip %s; fi -pip3 install %s %s -r %s %s 1>&2`, venvPath, venvPath, verboseFlag, nocache, quiet, reqmtsFile.Name(), verboseFlag) +if ! which pip%s; then python%s -m pip install pip %s; fi +pip%s install %s %s -r %s %s 1>&2`, venvPath, venvPath, version, version, verboseFlag, version, nocache, quiet, reqmtsFile.Name(), verboseFlag) cmd = exec.CommandContext(ctx, "/bin/bash", "-c", cmds) cmd.Dir = filepath.Dir(venvPath) diff --git a/pkg/runtime/needs/python.go b/pkg/runtime/needs/python.go index e418eb00..e8ea4779 100644 --- a/pkg/runtime/needs/python.go +++ b/pkg/runtime/needs/python.go @@ -6,8 +6,12 @@ import ( "os/exec" ) -func InstallPython(ctx context.Context, version string, requirements string, opts Options) (string, error) { - if _, err := exec.LookPath("python3"); err != nil { +func InstallPython(ctx context.Context, version, requirements string, opts Options) (string, error) { + if version == "" || version == "latest" { + version = "3" + } + + if _, err := exec.LookPath("python" + version); err != nil { if errors.Is(err, exec.ErrNotFound) { if _, err := installPython(ctx, version, opts.Verbose); err != nil { return "", err @@ -17,7 +21,7 @@ func InstallPython(ctx context.Context, version string, requirements string, opt } if requirements != "" { //returns bin path where installed - return requirementsInstall(ctx, requirements, opts.Verbose) + return requirementsInstall(ctx, version, requirements, opts.Verbose) } return "", nil } diff --git a/tests/tests/python-basic/pail/main.py b/tests/tests/python-basic/pail/src/main.py similarity index 100% rename from tests/tests/python-basic/pail/main.py rename to tests/tests/python-basic/pail/src/main.py diff --git a/tests/tests/python-code-code-quality/pail/main.py b/tests/tests/python-code-code-quality/pail/src/main.py similarity index 100% rename from tests/tests/python-code-code-quality/pail/main.py rename to tests/tests/python-code-code-quality/pail/src/main.py diff --git a/tests/tests/python-language-html2parquet/pail/main.py b/tests/tests/python-language-html2parquet/pail/src/main.py similarity index 100% rename from tests/tests/python-language-html2parquet/pail/main.py rename to tests/tests/python-language-html2parquet/pail/src/main.py diff --git a/tests/tests/python-language-pdf2parquet/pail/app.yaml b/tests/tests/python-language-pdf2parquet/pail/app.yaml deleted file mode 100644 index 9cff5031..00000000 --- a/tests/tests/python-language-pdf2parquet/pail/app.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: lunchpail.io/v1alpha1 -kind: Application -metadata: - name: doc_chunk -spec: - image: docker.io/python:3.12 - command: python3 ./main.py - minSize: sm - env: - USE_NNPACK: '0' # otherwise torch fails with "Could not initialize NNPACK! Reason: Unsupported hardware" on ARM (lack of AVX instructions) - needs: - - name: python - version: latest - requirements: | -{{ .Files.Get "data/requirements.txt" | indent 8 }} - code: - - name: main.py - source: | -{{ .Files.Get "src/main.py" | indent 8 }} - - name: doc_chunk_chunkers.py - source: | -{{ .Files.Get "src/doc_chunk_chunkers.py" | indent 8 }} diff --git a/tests/tests/python-language-pdf2parquet/pail/command b/tests/tests/python-language-pdf2parquet/pail/command new file mode 100644 index 00000000..640bb4cf --- /dev/null +++ b/tests/tests/python-language-pdf2parquet/pail/command @@ -0,0 +1 @@ +python3.11 main.py \ No newline at end of file diff --git a/tests/tests/python-language-pdf2parquet/pail/data/requirements.txt b/tests/tests/python-language-pdf2parquet/pail/data/requirements.txt deleted file mode 100644 index cbcebc76..00000000 --- a/tests/tests/python-language-pdf2parquet/pail/data/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -docling-core==2.3.0 -docling-ibm-models==2.0.3 -deepsearch-glm==0.26.1 -docling==2.3.1 -filetype >=1.2.0, <2.0.0 - -# we can probably update to 18+, but we will have to re-generate expected output as pyarrow 18 seems to have resulted in a binary format change -pyarrow<18 diff --git a/tests/tests/python-language-pdf2parquet/pail/env.yaml b/tests/tests/python-language-pdf2parquet/pail/env.yaml new file mode 100644 index 00000000..be589dee --- /dev/null +++ b/tests/tests/python-language-pdf2parquet/pail/env.yaml @@ -0,0 +1 @@ +USE_NNPACK: '0' # otherwise torch fails with "Could not initialize NNPACK! Reason: Unsupported hardware" on ARM (lack of AVX instructions) diff --git a/tests/tests/python-language-pdf2parquet/pail/image b/tests/tests/python-language-pdf2parquet/pail/image new file mode 100644 index 00000000..820d23cc --- /dev/null +++ b/tests/tests/python-language-pdf2parquet/pail/image @@ -0,0 +1 @@ +docker.io/python:3.11 \ No newline at end of file diff --git a/tests/tests/python-language-pdf2parquet/pail/requirements.txt b/tests/tests/python-language-pdf2parquet/pail/requirements.txt new file mode 100644 index 00000000..16a5eeb2 --- /dev/null +++ b/tests/tests/python-language-pdf2parquet/pail/requirements.txt @@ -0,0 +1,7 @@ +docling-core==2.3.0 +docling-ibm-models==2.0.3 +deepsearch-glm==0.26.1 +docling==2.3.1 +filetype >=1.2.0, <2.0.0 + +pyarrow<18 diff --git a/tests/tests/python-language-pdf2parquet/settings.sh b/tests/tests/python-language-pdf2parquet/settings.sh index 303a4402..6aa408f8 100644 --- a/tests/tests/python-language-pdf2parquet/settings.sh +++ b/tests/tests/python-language-pdf2parquet/settings.sh @@ -3,4 +3,7 @@ api=workqueue expected=("Done with nrows=1 nsuccess=1 nfail=0 nskip=0" "Done with nrows=2 nsuccess=2 nfail=0 nskip=0") NUM_DESIRED_OUTPUTS=0 +# the default is --yaml. we don't want that +source_from=" " + up_args='"$TEST_PATH"/pail/test-data/input/redp5110-ch1.pdf "$TEST_PATH"/pail/test-data/input/archive1.zip' diff --git a/tests/tests/test7f/pail/main.sh b/tests/tests/test7f/pail/src/main.sh similarity index 100% rename from tests/tests/test7f/pail/main.sh rename to tests/tests/test7f/pail/src/main.sh