Merge branch 'fix/align-image-classification-tutorial' of https://github.com/burtenshaw/argilla into fix/align-image-classification-tutorial
burtenshaw committed Mar 28, 2023
2 parents 3be37ea + 8da6ccf commit 98116a1
Showing 148 changed files with 4,473 additions and 1,611 deletions.
25 changes: 12 additions & 13 deletions .github/workflows/package.yml
@@ -19,6 +19,11 @@ on:
- "feature/**"
- "feat/**"

env:
# Increase this value to reset cache if etc/example-environment.yml has not changed
CACHE_NUMBER: 5
ALEMBIC_CONFIG: src/argilla/alembic.ini

jobs:
test-elastic:
name: Tests ElasticSearch
@@ -71,9 +76,6 @@ jobs:
with:
path: ${{ env.CONDA }}/envs
key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('environment_dev.yml') }}-${{ env.CACHE_NUMBER }}
env:
# Increase this value to reset cache if etc/example-environment.yml has not changed
CACHE_NUMBER: 2

- name: Update environment
if: steps.filter.outputs.python_code == 'true' && steps.cache.outputs.cache-hit != 'true'
@@ -82,9 +84,6 @@ jobs:
- name: Cache pip 👜
uses: actions/cache@v2
if: steps.filter.outputs.python_code == 'true'
env:
# Increase this value to reset cache if pyproject.toml has not changed
CACHE_NUMBER: 0
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ env.CACHE_NUMBER }}-${{ hashFiles('pyproject.toml') }}
@@ -98,6 +97,8 @@ jobs:
- name: Run tests 📈
if: steps.filter.outputs.python_code == 'true'
run: |
pip install -e ".[server,listeners]"
alembic upgrade head
pytest --cov=argilla --cov-report=xml
pip install "spacy<3.0" && python -m spacy download en_core_web_sm
pytest tests/monitoring/test_spacy_monitoring.py
@@ -115,6 +116,7 @@ jobs:
test-opensearch:
name: Test OpenSearch
runs-on: ubuntu-latest

strategy:
matrix:
version: [ 1.3, 2.3 ]
@@ -161,9 +163,6 @@ jobs:
with:
path: ${{ env.CONDA }}/envs
key: conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('environment_dev.yml') }}-${{ env.CACHE_NUMBER }}
env:
# Increase this value to reset cache if etc/example-environment.yml has not changed
CACHE_NUMBER: 2

- name: Update environment
if: steps.filter.outputs.python_code == 'true' && steps.cache.outputs.cache-hit != 'true'
@@ -172,9 +171,6 @@ jobs:
- name: Cache pip 👜
uses: actions/cache@v2
if: steps.filter.outputs.python_code == 'true'
env:
# Increase this value to reset cache if pyproject.toml has not changed
CACHE_NUMBER: 0
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ env.CACHE_NUMBER }}-${{ hashFiles('pyproject.toml') }}
@@ -188,6 +184,8 @@ jobs:
- name: Run tests 📈
if: steps.filter.outputs.python_code == 'true'
run: |
pip install -e ".[server,listeners]"
alembic upgrade head
pytest --cov=argilla --cov-report=xml
pip install "spacy<3.0" && python -m spacy download en_core_web_sm
pytest tests/monitoring/test_spacy_monitoring.py
@@ -333,12 +331,13 @@ jobs:

- name: Docker Hub Description
uses: peter-evans/dockerhub-description@v3
if: github.event_name == 'release' && env.IS_DEPLOYABLE == 'true'
with:
username: ${{ secrets.AR_DOCKER_USERNAME }}
password: ${{ secrets.AR_DOCKER_PASSWORD }}
repository: ${{ matrix.image }}
readme-filepath: ${{ matrix.readme }}
if: env.IS_DEPLOYABLE == 'true'


# This job will upload a Python Package using Twine when a release is created
# For more information see:
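The net effect of this workflow change is that `CACHE_NUMBER` moves out of the per-step `env` blocks into a single workflow-level one, and an Alembic migration now runs before the test suite. A minimal sketch of the resulting pattern — job and step names are illustrative, not the exact workflow:

```yaml
# Sketch under stated assumptions: one workflow-level env block feeds every
# cache key, so a single bump of CACHE_NUMBER invalidates all caches at once.
env:
  CACHE_NUMBER: 5                          # bump to reset all caches
  ALEMBIC_CONFIG: src/argilla/alembic.ini  # picked up by `alembic` in test steps

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Cache pip
        uses: actions/cache@v2
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ env.CACHE_NUMBER }}-${{ hashFiles('pyproject.toml') }}
      - name: Run tests
        run: |
          pip install -e ".[server,listeners]"
          alembic upgrade head   # apply database migrations before the test suite
          pytest --cov=argilla --cov-report=xml
```

Keeping the counter in one place means a single bump invalidates both the conda and pip caches at once, instead of editing three separate step-level values.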
4 changes: 3 additions & 1 deletion .gitignore
@@ -130,9 +130,11 @@ sw.*
# Vim swap files
*.swp


yarn.lock
package-lock.json

# App generated files
src/**/server/static/

# Old users db file
.users.yml
36 changes: 36 additions & 0 deletions CHANGELOG.md
@@ -7,11 +7,47 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- `ARGILLA_HOME_PATH` new environment variable ([#2564]).
- `ARGILLA_DATABASE_URL` new environment variable ([#2564]).
- Basic support for user roles with `admin` and `annotator` ([#2564]).
- `id`, `first_name`, `last_name`, `role`, `inserted_at` and `updated_at` new user fields ([#2564]).
- `/api/users` new endpoint to list and create users ([#2564]).
- `/api/users/{user_id}` new endpoint to delete users ([#2564]).
- `/api/workspaces` new endpoint to list and create workspaces ([#2564]).
- `/api/workspaces/{workspace_id}/users` new endpoint to list workspace users ([#2564]).
- `/api/workspaces/{workspace_id}/users/{user_id}` new endpoint to create and delete workspace users ([#2564]).
- `argilla.tasks.users.migrate` new task to migrate users from old YAML file to database ([#2564]).
- `argilla.tasks.users.create` new task to create a user ([#2564]).
- `argilla.tasks.users.create_default` new task to create a user with default credentials ([#2564]).
- `argilla.tasks.database.migrate` new task to execute database migrations ([#2564]).
- `release.Dockerfile` and `quickstart.Dockerfile` now create a default `argilladata` volume to persist data ([#2564]).

### Changed

- `ARGILLA_USERS_DB_FILE` environment variable is now only used to migrate users from the old YAML file to the database ([#2564]).
- `full_name` user field is now deprecated and `first_name` and `last_name` should be used instead ([#2564]).
- `password` user field now requires a minimum of `8` and a maximum of `100` characters ([#2564]).
- `quickstart.Dockerfile` image default users changed from `team` and `argilla` to `admin` and `annotator`, including new passwords and API keys ([#2564]).
- Datasets to be managed only by users with `admin` role ([#2564]).

### Fixed

- Copying datasets between workspaces with proper owner/workspace info. Closes [#2562](https://github.com/argilla-io/argilla/issues/2562)
- Using elasticsearch config to request backend version. Closes [#2311](https://github.com/argilla-io/argilla/issues/2311)


### Removed

- `email` user field ([#2564]).
- `disabled` user field ([#2564]).
- Support for private workspaces ([#2564]).
- `ARGILLA_LOCAL_AUTH_DEFAULT_APIKEY` and `ARGILLA_LOCAL_AUTH_DEFAULT_PASSWORD` environment variables. Use `python -m argilla.tasks.users.create_default` instead ([#2564]).

[#2564]: https://github.com/argilla-io/argilla/issues/2564


## [1.5.0](https://github.com/recognai/rubrix/compare/v1.4.0...v1.5.0) - 2023-03-21

### Added
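Taken together, the new tasks replace the YAML-based user store with a database-backed one. A hedged sketch of how the pieces might be run in order — only the `python -m argilla.tasks...` module paths come from the changelog above; the environment values are placeholders:

```bash
# Sketch of the new user/database flow (values illustrative).
export ARGILLA_HOME_PATH="$HOME/.argilla"          # where Argilla stores its files
export ARGILLA_DATABASE_URL="sqlite:///$ARGILLA_HOME_PATH/argilla.db?check_same_thread=False"

python -m argilla.tasks.database.migrate           # apply schema migrations
python -m argilla.tasks.users.create_default       # create a user with default credentials
python -m argilla.tasks.users.migrate              # import users from the old YAML file
```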
3 changes: 2 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,3 @@
include src/argilla/alembic.ini
graft src/argilla/server/static
prune docs
prune docs
15 changes: 12 additions & 3 deletions docker-compose.yaml
@@ -8,17 +8,24 @@ services:
ports:
- "6900:6900"
environment:
ARGILLA_HOME_PATH: /var/lib/argilla
ARGILLA_ELASTICSEARCH: http://elasticsearch:9200
# Opt-out for telemetry https://docs.argilla.io/en/latest/reference/telemetry.html
# ARGILLA_ENABLE_TELEMETRY: 0
# ARGILLA_ENABLE_TELEMETRY: 0 # Opt-out for telemetry https://docs.argilla.io/en/latest/reference/telemetry.html

# Set user configuration https://docs.argilla.io/en/latest/getting_started/installation/user_management.html
# ARGILLA_LOCAL_AUTH_USERS_DB_FILE: /config/.users.yaml
# volumes:
#- ${PWD}/.users.yaml:/config/.users.yaml

# DEFAULT_USER_ENABLED: false # Uncomment this line to disable the creation of the default user
# DEFAULT_USER_PASSWORD: custom-password # Uncomment this line to set a custom password for the default user
# DEFAULT_USER_API_KEY: custom-api-key # Uncomment this line to set a custom api-key for the default user
networks:
- argilla

volumes:
# ARGILLA_HOME_PATH is used to define where Argilla will save its application data.
# If you change the ARGILLA_HOME_PATH value, use the same value for the argilladata volume below too.
- argilladata:/var/lib/argilla
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.5.3
environment:
@@ -48,8 +55,10 @@ services:
ELASTICSEARCH_HOSTS: '["http://elasticsearch:9200"]'
networks:
- argilla

networks:
argilla:
driver: bridge
volumes:
argilladata:
elasticdata:
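The comment in the diff above is the key constraint: the volume mount target must track `ARGILLA_HOME_PATH`, or data will not persist across container restarts. A hypothetical variation with a custom path, for illustration only:

```yaml
# Hypothetical variation (not in the diff): moving the data directory.
# The volume target must match ARGILLA_HOME_PATH exactly.
services:
  argilla:
    environment:
      ARGILLA_HOME_PATH: /data/argilla
    volumes:
      - argilladata:/data/argilla   # same path as ARGILLA_HOME_PATH
volumes:
  argilladata:
```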
3 changes: 3 additions & 0 deletions docs/_source/_common/techniques_points.md
@@ -20,4 +20,7 @@ Are you unsure about what these techniques entail, take a look at our [terminolo
```{grid-item-card} 🔫 Few-shot classification
:link: ./few_shot.html
```
```{grid-item-card} 🔦 Semantic Search
:link: ./semantic_search.html
```
````
@@ -0,0 +1,8 @@
```{grid-item-card} 📸 Bulk Labelling Multimodal Data
:img-top: /_static/tutorials/labelling-textclassification-sentencetransformers-semantic/4.png
:link: ../../tutorials/notebooks/labelling-textclassification-sentencetransformers-semantic.html
MLOps Steps: Labelling \
NLP Tasks: TextClassification (images) \
Libraries: Argilla, sentence-transformers \
Techniques: Semantic search
```
@@ -1,4 +1,4 @@
```{grid-item-card} 🕵️‍♀️ Compare two Text Classification zero-shot models using Argilla
```{grid-item-card} 🥇 Compare Text Classification Models
:img-top: /_static/tutorials/monitoring-textclassification-setfit-explainability/f1-score-for-zero-shot-model.png
:link: ../../tutorials/notebooks/monitoring-textclassification-setfit-explainability.html
6 changes: 3 additions & 3 deletions docs/_source/community/developer_docs.rst
@@ -68,11 +68,11 @@ Building the documentation
--------------------------

To build the documentation, make sure you set up your system for *Argilla* development.
Then go to the `docs` folder in your cloned repo and execute the ``make`` command:
Then go to the `docs/_source` folder in your cloned repo and execute the ``make html`` command:

.. code-block:: bash
cd docs
cd docs/_source
make html
This will create a ``_build/html`` folder in which you can find the ``index.html`` file of the documentation.
@@ -81,4 +81,4 @@ Alternatively, you can install and use `sphinx-autobuild` to continuously deploy

.. code-block:: bash
sphinx-autobuild docs/_source docs/_build/html
sphinx-autobuild docs/_source docs/_build/html
3 changes: 2 additions & 1 deletion docs/_source/conf.py
@@ -31,6 +31,7 @@

# -- Project information -----------------------------------------------------
import os
from datetime import datetime

try:
import argilla as rg
@@ -41,7 +42,7 @@


project = "Argilla"
copyright = "2022, Argilla.io"
copyright = f"{datetime.today().year}, Argilla.io"
author = "Argilla.io"

# Normally the full version, including alpha/beta/rc tags.
2 changes: 1 addition & 1 deletion docs/_source/getting_started/argilla.md
@@ -19,7 +19,7 @@ Deploy your own Argilla Server on Spaces with a few clicks:
🆕 Use embeddings to find the most similar records with the UI. This feature uses vector search combined with traditional search (keyword and filter based).
Get started: [Semantic Search Deep-dive guide](../guides/features/semantic-search.ipynb)
Get started: [Semantic Search Deep-dive guide](../guides/label_records_with_semanticsearch.ipynb)
```

@@ -41,17 +41,17 @@ and then provide the analyzer name using the specific environment variable.

## Reindex data

Sometimes updates require reindexing our dataset metrics and Elasticsearch, therefore we devised some [short documentation](../../guides/features/datasets) to show you how to do this from our Python client.
Sometimes updates require reindexing our dataset metrics and Elasticsearch, therefore we devised some [short documentation](../../guides/log_load_and_prepare_data) to show you how to do this from our Python client.

## Backups using snapshots

Within Elastic, it is possible to create snapshots of a running cluster. We highly recommend doing this to ensure experiment reproducibility and to not risk losing your valuable annotated data. Elastic offers an overview of how to do this within [their docs](https://www.elastic.co/guide/en/elasticsearch/reference/current/snapshots-take-snapshot.html). Underneath we will walk you through a minimal reproducible example.

### Mount back-up volume

When deploying Elastic, we need to define a `path.repo` via setting this as an environment variable in your `docker-compose.yml` or by setting this in your `elasticsearch.yml`, and passing this as config. Additionally, we need to pass the same `path.repo` to a mounted volume. By default, we set this to `elasticdata:/usr/share/elasticsearch/backups` because the `elasticsearch` user needs to have full permissions to act on the repo. Hence, setting the volume to something different might require some additional permission configurations. Note that the `minimum_master_nodes` need to be explicitly set when bound on a public IP.
When deploying Elastic, we need to define a `path.repo` via setting this as an environment variable in your `docker-compose.yaml` or by setting this in your `elasticsearch.yml`, and passing this as config. Additionally, we need to pass the same `path.repo` to a mounted volume. By default, we set this to `elasticdata:/usr/share/elasticsearch/backups` because the `elasticsearch` user needs to have full permissions to act on the repo. Hence, setting the volume to something different might require some additional permission configurations. Note that the `minimum_master_nodes` need to be explicitly set when bound on a public IP.

#### `docker-compose.yml`
#### `docker-compose.yaml`

```yaml
elasticsearch:
@@ -93,7 +93,7 @@ Within our designated `path.repo`, we can now create a snapshot repository, whic

Go to your Kibana `host:ip/app/management/data/snapshot_restore/repositories`, on localhost go [here](http://localhost:5601/app/management/data/snapshot_restore/repositories). Press `Register a repository` and set the repo name to whatever you like; in our example we will use `argilla-kibana`. Additionally, we will choose the default option of using a shared file system.

![add repo](../../_static/images/installation/elastic_snapshots/add_repo.png)
![add repo](../../../../../../_static/images/installation/elastic_snapshots/add_repo.png)

Next, we need to fill out the `path.repo` and set it to `/usr/share/elasticsearch/backups`. Additionally we can pass some configuration to reduce the load on the cluster caused by backups by defining chunking and byte processing sizes, but for this toy example we will leave them empty.
#### cURL
@@ -117,7 +117,7 @@ Next, we can verify the creation of the back-up.
curl -X GET "localhost:9200/_snapshot/argilla-curl?pretty"
```

![add repo success](../../_static/images/installation/elastic_snapshots/add_repo_succes.png)
![add repo success](../../../../../../_static/images/installation/elastic_snapshots/add_repo_succes.png)

### Create snapshot policy

@@ -127,7 +127,7 @@ Now we have defined where snapshots are going to be stored, we can continue with

Go to your Kibana `host:ip/app/management/data/snapshot_restore/add_policy`, on localhost go [here](http://localhost:5601/app/management/data/snapshot_restore/add_policy). Press `Create a policy` and set the repo name to whatever you like, in our example we will use `argilla-kibana-policy` and execute it on the `argilla-kibana` repo. Also, there are some config options about retention, snapshot naming and scheduling, that we will not discuss in-depth, but underneath you can find a minimal example.

![add policy](../../_static/images/installation/elastic_snapshots/add_policy.png)
![add policy](../../../../../../_static/images/installation/elastic_snapshots/add_policy.png)
#### cURL

If your Elastic IP is public, it is possible to use `cURL` directly to create a repo. If not, we first need to `SSH` into the cluster before calling the `cURL` command. In our example we will define an `argilla-curl-policy` and execute it on the `argilla-curl` repo.
@@ -150,4 +150,4 @@ curl -X PUT "localhost:9200/_slm/policy/argilla-curl-policy?pretty" -H 'Content-
'
```

![add policy success](../../_static/images/installation/elastic_snapshots/add_policy_succes.png)
![add policy success](../../../../../../_static/images/installation/elastic_snapshots/add_policy_succes.png)
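The diff stops at policy creation. As an illustrative next step (not part of this commit), the standard Elasticsearch snapshot API can take and restore a snapshot against the repositories created above; the endpoint shapes follow the official API, but the snapshot name is a placeholder:

```bash
# Take a snapshot in the argilla-curl repo and wait for it to finish.
curl -X PUT "localhost:9200/_snapshot/argilla-curl/snapshot-1?wait_for_completion=true&pretty"

# Restore it later (target indices must be closed or deleted first,
# or restored under new names via the rename_* options).
curl -X POST "localhost:9200/_snapshot/argilla-curl/snapshot-1/_restore?pretty"
```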
@@ -32,6 +32,10 @@ You can set the following environment variables to further configure your server and

### Server

- `ARGILLA_HOME_PATH`: The directory where Argilla will store all the files needed to run. If the path doesn't exist, it will be automatically created (Default: `~/.argilla`).

- `ARGILLA_DATABASE_URL`: A URL string that contains the necessary information to connect to a database. Argilla uses SQLite by default; PostgreSQL is also officially supported (Default: `sqlite:///$ARGILLA_HOME_PATH/argilla.db?check_same_thread=False`).

- `ARGILLA_ELASTICSEARCH`: URL of the connection endpoint of the Elasticsearch instance (Default: `http://localhost:9200`).

- `ARGILLA_ELASTICSEARCH_SSL_VERIFY`: If "False", disables SSL certificate verification when connecting to the Elasticsearch backend.
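As a sketch of how these variables combine — assuming placeholder credentials and using the `argilla.tasks.database.migrate` task named in the CHANGELOG above:

```bash
# Illustrative configuration pointing Argilla at PostgreSQL instead of the
# default SQLite database (host, user, and password are placeholders).
export ARGILLA_HOME_PATH="/var/lib/argilla"
export ARGILLA_DATABASE_URL="postgresql://argilla:changeme@localhost:5432/argilla"
export ARGILLA_ELASTICSEARCH="http://localhost:9200"

python -m argilla.tasks.database.migrate   # apply migrations to the configured database
```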