From 7607c7552d500d90c537ca54dfdd82851ebadc3e Mon Sep 17 00:00:00 2001 From: "Dilyara Zharikova (Baymurzina)" Date: Wed, 22 Nov 2023 09:02:14 +0300 Subject: [PATCH] feat: add mistral 7b 128k (#589) * feat: add mistral 7b 128k * fix: model name * fix: model name * fix: configs * fix: config names and cards * fix: more cards * fix: mem * fix: no proxy available --- MODELS.md | 1 + .../document_based_qa_transformers/cpu.yml | 10 + .../db_conf.json | 6 + .../document_based_qa_transformers/dev.yml | 52 ++++ .../docker-compose.override.yml | 158 ++++++++++ .../pipeline_conf.json | 278 ++++++++++++++++++ .../document_based_qa_transformers/proxy.yml | 29 ++ .../telegram.yml | 17 ++ .../document_based_qa_transformers/test.yml | 48 +++ .../transformers_mistral.json | 8 + components.tsv | 6 + components/dfghndoifg023rn2.yml | 27 ++ components/dfjksfoiu2093rn2oeif09.yml | 32 ++ components/dfkjdfboi3esv2quyw3.yml | 32 ++ components/fwjh092j3rnrfy6.yml | 29 ++ .../environment.yml | 7 + .../service.yml | 18 ++ services/transformers_lm/Dockerfile | 2 + services/transformers_lm/server.py | 8 + .../environment.yml | 7 + .../service.yml | 44 +++ .../dff-document-qa-llm-skill/service.yml | 1 + .../environment.yml | 8 + .../service.yml | 31 ++ 24 files changed, 859 insertions(+) create mode 100644 assistant_dists/document_based_qa_transformers/cpu.yml create mode 100644 assistant_dists/document_based_qa_transformers/db_conf.json create mode 100644 assistant_dists/document_based_qa_transformers/dev.yml create mode 100644 assistant_dists/document_based_qa_transformers/docker-compose.override.yml create mode 100644 assistant_dists/document_based_qa_transformers/pipeline_conf.json create mode 100644 assistant_dists/document_based_qa_transformers/proxy.yml create mode 100644 assistant_dists/document_based_qa_transformers/telegram.yml create mode 100644 assistant_dists/document_based_qa_transformers/test.yml create mode 100644 common/generative_configs/transformers_mistral.json create mode 100644 components/dfghndoifg023rn2.yml create mode 100644 components/dfjksfoiu2093rn2oeif09.yml create mode 100644 components/dfkjdfboi3esv2quyw3.yml create mode 100644 components/fwjh092j3rnrfy6.yml create mode 100644 services/agent_services/service_configs/document_based_qa_transformers/environment.yml create mode 100644 services/agent_services/service_configs/document_based_qa_transformers/service.yml create mode 100644 services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/environment.yml create mode 100644 services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/service.yml create mode 100644 skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/environment.yml create mode 100644 skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/service.yml diff --git a/MODELS.md b/MODELS.md index 8588da5e33..793e7dbe48 100644 --- a/MODELS.md +++ b/MODELS.md @@ -14,3 +14,4 @@ | [Anthropic Claude Instant v1](https://docs.anthropic.com/claude/reference/complete_post) | anthropic-api-claude-instant-v1 | no (paid access via API) | supposedly, 52B | - (cannot be run locally) | 9,000 tokens | available under subscription plan, commercial use allowed | A smaller model with far lower latency, sampling at roughly 40 words/sec! Its output quality is somewhat lower than the latest claude-1 model, particularly for complex tasks. However, it is much less expensive and blazing fast. NB: paid. You must provide your Anthropic API key to use the model. 
Your Anthropic API account will be charged according to your usage. |
 | Russian XGLM 4.5B (private weights) | transformers-lm-ruxglm | no | 4.5B | 15GB | 2,048 tokens | Not available yet | A private large language model for the Russian language which was fine-tuned for instruction following by Dmitry Kosenko in Summer 2023. This model is up and running on our servers and can be used for free. |
 | [ruGPT-3.5-13B](https://huggingface.co/ai-forever/ruGPT-3.5-13B) | transformers-lm-rugpt35 | yes | 13B | 35GB (half-precision) | 2,048 tokens | MIT | A large language model for the Russian language which was used for training GigaChat. This model is up and running on our servers and can be used for free. |
+| [Mistral 7B 128k Tokens](https://huggingface.co/NousResearch/Yarn-Mistral-7b-128k) | transformers-lm-mistral-7b-128k | yes | 7B | 20GB (half-precision) | 128,000 tokens | Apache 2.0, commercial use is allowed | An open-source English-only large language model which was fine-tuned for instruction following but is NOT capable of code generation. NB: free of charge. This model is up and running on our servers and can be used for free. |
diff --git a/assistant_dists/document_based_qa_transformers/cpu.yml b/assistant_dists/document_based_qa_transformers/cpu.yml
new file mode 100644
index 0000000000..22cb91c24f
--- /dev/null
+++ b/assistant_dists/document_based_qa_transformers/cpu.yml
@@ -0,0 +1,10 @@
+version: '3.7'
+services:
+  combined-classification:
+    environment:
+      DEVICE: cpu
+      CUDA_VISIBLE_DEVICES: ""
+  sentence-ranker:
+    environment:
+      DEVICE: cpu
+      CUDA_VISIBLE_DEVICES: ""
\ No newline at end of file
diff --git a/assistant_dists/document_based_qa_transformers/db_conf.json b/assistant_dists/document_based_qa_transformers/db_conf.json
new file mode 100644
index 0000000000..a9ba6813f5
--- /dev/null
+++ b/assistant_dists/document_based_qa_transformers/db_conf.json
@@ -0,0 +1,6 @@
+{
+    "host": "DB_HOST",
+    "port": "DB_PORT",
+    "name": "DB_NAME",
+    "env": true
+}
\ No newline at end of file
diff --git a/assistant_dists/document_based_qa_transformers/dev.yml b/assistant_dists/document_based_qa_transformers/dev.yml
new file mode 100644
index 0000000000..83e9f54c7e
--- /dev/null
+++ b/assistant_dists/document_based_qa_transformers/dev.yml
@@ -0,0 +1,52 @@
+# These volumes make debugging convenient: no need to rebuild the container every time the code changes
+services:
+  agent:
+    volumes:
+      - ".:/dp-agent"
+    ports:
+      - 4242:4242
+  files:
+    ports:
+      - 3000:3000
+    volumes:
+      - "~/.deeppavlov/file_server:/tmp"
+  sentseg:
+    volumes:
+      - "./annotators/SentSeg:/src"
+    ports:
+      - 8011:8011
+  combined-classification:
+    volumes:
+      - "./common:/src/common"
+      - "./annotators/combined_classification:/src"
+    ports:
+      - 8087:8087
+  sentence-ranker:
+    volumes:
+      - "./services/sentence_ranker:/src"
+      - "~/.deeppavlov/cache:/root/.cache"
+    ports:
+      - 8128:8128
+  transformers-lm-mistral-7b-128k:
+    volumes:
+      - "./services/transformers_lm:/src"
+      - "./common:/src/common"
+      - "~/.deeppavlov/cache:/root/.cache"
+    ports:
+      - 8185:8185
+  doc-retriever:
+    volumes:
+      - "./annotators/doc_retriever:/src"
+      - "./common:/src/common"
+      - "./documents:/src/documents"
+    ports:
+      - 8165:8165
+  dff-document-qa-transformers-llm-skill:
+    volumes:
+      - "./skills/dff_document_qa_llm_skill:/src"
+      - "./common:/src/common"
+      - "./documents:/src/documents"
+    ports:
+      - 8186:8186
+
+version: "3.7"
diff --git a/assistant_dists/document_based_qa_transformers/docker-compose.override.yml 
b/assistant_dists/document_based_qa_transformers/docker-compose.override.yml new file mode 100644 index 0000000000..f626ecc3b2 --- /dev/null +++ b/assistant_dists/document_based_qa_transformers/docker-compose.override.yml @@ -0,0 +1,158 @@ +services: + agent: + command: sh -c 'bin/wait && python -m deeppavlov_agent.run agent.pipeline_config=assistant_dists/document_based_qa_transformers/pipeline_conf.json' + environment: + WAIT_HOSTS: "sentseg:8011, combined-classification:8087, ranking-based-response-selector:8002, + sentence-ranker:8128, transformers-lm-mistral-7b-128k:8185, doc-retriever:8165, dff-document-qa-transformers-llm-skill:8186" + WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-1000} + HIGH_PRIORITY_INTENTS: 1 + RESTRICTION_FOR_SENSITIVE_CASE: 1 + ALWAYS_TURN_ON_ALL_SKILLS: 0 + LANGUAGE: EN + + files: + image: julienmeerschart/simple-file-upload-download-server + + sentseg: + env_file: [ .env ] + build: + context: ./annotators/SentSeg/ + dockerfile: Dockerfile-test + command: flask run -h 0.0.0.0 -p 8011 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 1.5G + reservations: + memory: 1.5G + + doc-retriever: + env_file: [ .env ] + build: + context: . + dockerfile: ./annotators/doc_retriever/Dockerfile + args: + SERVICE_PORT: 8165 + SERVICE_NAME: doc_retriever + CONFIG_PATH: ./doc_retriever_config.json + DOC_PATH_OR_LINK: http://files.deeppavlov.ai/dream_data/documents_for_qa/test_file_dream_repo.html,http://files.deeppavlov.ai/dream_data/documents_for_qa/alphabet_financial_report.txt,http://files.deeppavlov.ai/dream_data/documents_for_qa/test_file_jurafsky_chatbots.pdf + PARAGRAPHS_NUM: 5 + FILE_SERVER_TIMEOUT: 30 + command: python -m flask run -h 0.0.0.0 -p 8165 + environment: + - FLASK_APP=server + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + limits: + memory: 5G + reservations: + memory: 5G + + combined-classification: + env_file: [ .env ] + build: + args: + CONFIG: combined_classifier.json + SERVICE_PORT: 8087 + context: . + dockerfile: ./annotators/combined_classification/Dockerfile + command: gunicorn --workers=1 server:app -b 0.0.0.0:8087 --timeout 600 + environment: + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + limits: + memory: 2G + reservations: + memory: 2G + + ranking-based-response-selector: + env_file: [ .env ] + build: + args: + SERVICE_PORT: 8002 + SERVICE_NAME: response_selector + LANGUAGE: EN + SENTENCE_RANKER_ANNOTATION_NAME: sentence_ranker + SENTENCE_RANKER_SERVICE_URL: http://sentence-ranker:8128/respond + SENTENCE_RANKER_TIMEOUT: 3 + N_UTTERANCES_CONTEXT: 5 + FILTER_TOXIC_OR_BADLISTED: 1 + context: . + dockerfile: ./response_selectors/ranking_based_response_selector/Dockerfile + command: flask run -h 0.0.0.0 -p 8002 + environment: + - FLASK_APP=server + deploy: + resources: + limits: + memory: 100M + reservations: + memory: 100M + + sentence-ranker: + env_file: [ .env ] + build: + args: + SERVICE_PORT: 8128 + SERVICE_NAME: sentence_ranker + PRETRAINED_MODEL_NAME_OR_PATH: sentence-transformers/all-MiniLM-L6-v2 + context: ./services/sentence_ranker/ + command: flask run -h 0.0.0.0 -p 8128 + environment: + - CUDA_VISIBLE_DEVICES=0 + - FLASK_APP=server + deploy: + resources: + limits: + memory: 3G + reservations: + memory: 3G + + transformers-lm-mistral-7b-128k: + env_file: [ .env ] + build: + args: + SERVICE_PORT: 8185 + SERVICE_NAME: transformers_lm_mistral_7b_128k + PRETRAINED_MODEL_NAME_OR_PATH: NousResearch/Yarn-Mistral-7b-128k + HALF_PRECISION: 1 + USE_FLASH_ATTENTION_2: 1 + context: . 
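+      # HALF_PRECISION makes services/transformers_lm/server.py load the checkpoint with
+      # torch_dtype=torch.float16 (cf. the 20GB half-precision figure in MODELS.md), and
+      # USE_FLASH_ATTENTION_2 makes it pass use_flash_attention_2=True plus
+      # trust_remote_code=True to AutoModelForCausalLM.from_pretrained.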
+ dockerfile: ./services/transformers_lm/Dockerfile + command: flask run -h 0.0.0.0 -p 8185 + environment: + - CUDA_VISIBLE_DEVICES=0 + - FLASK_APP=server + deploy: + resources: + limits: + memory: 50G + reservations: + memory: 50G + + dff-document-qa-transformers-llm-skill: + env_file: [ .env ] + build: + args: + SERVICE_PORT: 8186 + SERVICE_NAME: dff_document_qa_llm_skill + GENERATIVE_SERVICE_URL: http://transformers-lm-mistral-7b-128k:8185/respond + GENERATIVE_SERVICE_CONFIG: transformers_mistral.json + GENERATIVE_TIMEOUT: 120 + N_UTTERANCES_CONTEXT: 7 + FILE_SERVER_TIMEOUT: 30 + DOCUMENT_PROMPT_FILE: common/prompts/document_qa_instruction.json + context: . + dockerfile: ./skills/dff_document_qa_llm_skill/Dockerfile + deploy: + resources: + limits: + memory: 128M + reservations: + memory: 128M + +version: '3.7' diff --git a/assistant_dists/document_based_qa_transformers/pipeline_conf.json b/assistant_dists/document_based_qa_transformers/pipeline_conf.json new file mode 100644 index 0000000000..466984d51d --- /dev/null +++ b/assistant_dists/document_based_qa_transformers/pipeline_conf.json @@ -0,0 +1,278 @@ +{ + "connectors": { + "sentseg": { + "protocol": "http", + "timeout": 1.5, + "url": "http://sentseg:8011/sentseg" + } + }, + "services": { + "last_chance_service": { + "connector": { + "protocol": "python", + "class_name": "PredefinedTextConnector", + "response_text": "Sorry, something went wrong inside. Please tell me, what did you say.", + "annotations": { + "sentseg": { + "punct_sent": "Sorry, something went wrong inside. Please tell me, what did you say.", + "segments": [ + "Sorry, something went wrong inside.", + "Please tell me, what did you say." + ] + } + } + }, + "state_manager_method": "add_bot_utterance_last_chance", + "tags": [ + "last_chance" + ], + "source": { + "component": "components/dfkjdfboi3esv2quyw3.yml", + "service": "services/agent_services/service_configs/document_based_qa_transformers" + } + }, + "timeout_service": { + "connector": { + "protocol": "python", + "class_name": "PredefinedTextConnector", + "response_text": "Sorry, I need to think more on that. Let's talk about something else.", + "annotations": { + "sentseg": { + "punct_sent": "Sorry, I need to think more on that. Let's talk about something else.", + "segments": [ + "Sorry, I need to think more on that.", + "Let's talk about something else." 
+ ] + } + } + }, + "state_manager_method": "add_bot_utterance_last_chance", + "tags": [ + "timeout" + ], + "source": { + "component": "components/dfjksfoiu2093rn2oeif09.yml", + "service": "services/agent_services/service_configs/document_based_qa_transformers" + } + }, + "response_annotator_selectors": { + "connector": { + "protocol": "python", + "class_name": "skill_selectors.post_annotator_selector.connector:PostAnnotatorSelectorConnector", + "annotator_names": [ + "sentseg" + ] + }, + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "tags": [ + "selector" + ], + "is_enabled": true, + "source": { + "component": "components/LXrJDIf43gwNmPMNXG5Eg.yml", + "service": "services/response_annotator_selectors/service_configs/agent" + } + }, + "response_annotators": { + "sentseg": { + "connector": { + "protocol": "http", + "timeout": 1.5, + "url": "http://sentseg:8011/sentseg" + }, + "dialog_formatter": "state_formatters.dp_formatters:last_bot_utt_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "response_annotator_selectors" + ], + "state_manager_method": "add_annotation_prev_bot_utt", + "is_enabled": true, + "source": { + "component": "components/1Q9QXih1U2zhCpVm9zxdsA.yml", + "service": "annotators/SentSeg/service_configs/sentseg" + } + } + }, + "annotators": { + "sentseg": { + "connector": { + "protocol": "http", + "timeout": 1.5, + "url": "http://sentseg:8011/sentseg" + }, + "dialog_formatter": "state_formatters.dp_formatters:preproc_last_human_utt_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [], + "state_manager_method": "add_annotation", + "is_enabled": true, + "source": { + "component": "components/gM4fEjvVqLlSRRRkQfds2g.yml", + "service": "annotators/SentSeg/service_configs/sentseg" + } + }, + "train_and_upload_model": { + "connector": { + "protocol": "http", + "timeout": 50, + "url": "http://doc-retriever:8165/train_and_upload_model" + }, + "dialog_formatter": "state_formatters.dp_formatters:utt_non_punct_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [], + "state_manager_method": "update_attributes", + "is_enabled": true, + "source": { + "component": "components/jkb98534UUIjfdn67IN.yml", + "service": "annotators/doc_retriever/service_configs/doc-retriever" + } + }, + "doc_retriever": { + "connector": { + "protocol": "http", + "timeout": 30, + "url": "http://doc-retriever:8165/return_candidates" + }, + "dialog_formatter": "state_formatters.dp_formatters:utt_sentseg_punct_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "annotators.train_and_upload_model" + ], + "state_manager_method": "add_annotation", + "is_enabled": true, + "source": { + "component": "components/nlknkn768678DUFYbcjk.yml", + "service": "annotators/doc_retriever/service_configs/doc-retriever" + } + } + }, + "skill_selectors": { + "description_based_skill_selector": { + "connector": { + "protocol": "python", + "class_name": "skill_selectors.description_based_skill_selector.connector:DescriptionBasedSkillSelectorConnector" + }, + "dialog_formatter": "state_formatters.dp_formatters:base_skill_selector_formatter_dialog", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "annotators" + ], + "tags": [ + "selector" + ], + "is_enabled": true, + "source": { + 
"component": "components/dfsw4bji8bgjq2.yml", + "service": "skill_selectors/description_based_skill_selector/service_configs/agent" + } + } + }, + "skills": { + "dff_document_qa_llm_skill": { + "connector": { + "protocol": "http", + "timeout": 120.0, + "url": "http://dff-document-qa-transformers-llm-skill:8186/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:dff_prompted_skill_formatter", + "response_formatter": "state_formatters.dp_formatters:skill_with_attributes_formatter_service", + "previous_services": [ + "skill_selectors" + ], + "state_manager_method": "add_hypothesis", + "is_enabled": true, + "source": { + "component": "components/fwjh092j3rnrfy6.yml", + "service": "skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill" + } + }, + "dummy_skill": { + "connector": { + "protocol": "python", + "class_name": "skills.dummy_skill.connector:DummySkillConnector" + }, + "dialog_formatter": "state_formatters.dp_formatters:utt_sentrewrite_modified_last_dialog", + "response_formatter": "state_formatters.dp_formatters:skill_with_attributes_formatter_service", + "previous_services": [ + "skill_selectors" + ], + "state_manager_method": "add_hypothesis", + "is_enabled": true, + "source": { + "component": "components/uYkoK0vRp4bbIg9akI1yw.yml", + "service": "skills/dummy_skill/service_configs/agent" + } + } + }, + "candidate_annotators": { + "combined_classification": { + "connector": { + "protocol": "http", + "timeout": 2.0, + "url": "http://combined-classification:8087/batch_model" + }, + "dialog_formatter": "state_formatters.dp_formatters:hypothesis_histories_list", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "skills" + ], + "state_manager_method": "add_hypothesis_annotation_batch", + "is_enabled": true, + "source": { + "component": "components/PbLNvh4hrvs47rPaf2bfYQ.yml", + "service": "annotators/combined_classification/service_configs/combined-classification" + } + }, + "sentence_ranker": { + "connector": { + "protocol": "http", + "timeout": 1.0, + "url": "http://sentence-ranker:8128/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:sentence_ranker_formatter", + "response_formatter": "state_formatters.dp_formatters:simple_formatter_service", + "previous_services": [ + "skills" + ], + "state_manager_method": "add_hypothesis_annotation_batch", + "is_enabled": true, + "source": { + "component": "components/XGwmAHtAOu0NDqqG3QCJw.yml", + "service": "services/sentence_ranker/service_configs/sentence-ranker" + } + } + }, + "response_selectors": { + "response_selector": { + "connector": { + "protocol": "http", + "timeout": 1.0, + "url": "http://ranking-based-response-selector:8002/respond" + }, + "dialog_formatter": "state_formatters.dp_formatters:cropped_dialog", + "response_formatter": "state_formatters.dp_formatters:base_response_selector_formatter_service", + "previous_services": [ + "candidate_annotators" + ], + "tags": [ + "selector" + ], + "state_manager_method": "add_bot_utterance", + "is_enabled": true, + "source": { + "component": "components/YJzc7NwGrLmKp6gfZJh7X1.yml", + "service": "response_selectors/ranking_based_response_selector/service_configs/ranking-based-response-selector" + } + } + } + }, + "metadata": { + "display_name": "Transformers-based Document QA", + "author": "DeepPavlov", + "description": "This assistant uses the power of Transformers LLMs to answer your questions based on the document you provide.", + "version": "0.0.1", + 
"date_created": "2023-01-10T02:00:00", + "ram_usage": "9 GB", + "gpu_usage": "3 GB", + "disk_usage": "10 GB" + } +} \ No newline at end of file diff --git a/assistant_dists/document_based_qa_transformers/proxy.yml b/assistant_dists/document_based_qa_transformers/proxy.yml new file mode 100644 index 0000000000..e2d988f10a --- /dev/null +++ b/assistant_dists/document_based_qa_transformers/proxy.yml @@ -0,0 +1,29 @@ +services: + combined-classification: + command: ["nginx", "-g", "daemon off;"] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=proxy.deeppavlov.ai:8087 + - PORT=8087 + + sentseg: + command: ["nginx", "-g", "daemon off;"] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=proxy.deeppavlov.ai:8011 + - PORT=8011 + + sentence-ranker: + command: ["nginx", "-g", "daemon off;"] + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=proxy.deeppavlov.ai:8128 + - PORT=8128 + +version: "3.7" diff --git a/assistant_dists/document_based_qa_transformers/telegram.yml b/assistant_dists/document_based_qa_transformers/telegram.yml new file mode 100644 index 0000000000..94181901c7 --- /dev/null +++ b/assistant_dists/document_based_qa_transformers/telegram.yml @@ -0,0 +1,17 @@ +services: + agent-tg: + command: sh -c 'bin/wait && python -m deeppavlov_agent.run agent.channel=telegram agent.telegram_token=$TG_TOKEN agent.pipeline_config=assistant_dists/document_based_qa_transformers/pipeline_conf.json agent.db_config=assistant_dists/document_based_qa_transformers/db_conf.json' + env_file: [.env] + build: + context: ./ + dockerfile: dockerfile_agent + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 2G + volumes: + - ".:/dp-agent" + +version: '3.7' diff --git a/assistant_dists/document_based_qa_transformers/test.yml b/assistant_dists/document_based_qa_transformers/test.yml new file mode 100644 index 0000000000..224c21d5ca --- /dev/null +++ b/assistant_dists/document_based_qa_transformers/test.yml @@ -0,0 +1,48 @@ +services: + agent: + volumes: + - "/cephfs/home/ignatov/artifacts:/output" + ports: + - ${AGENT_PORT}:4242 + mongo: + command: mongod + image: mongo:4.0.0 + files: + volumes: + - "~/.deeppavlov/file_server:/tmp" + ranking-based-response-selector: + volumes: + - "./response_selectors/ranking_based_response_selector:/src" + - "./common:/src/common" + combined-classification: + volumes: + - "~/.deeppavlov:/root/.deeppavlov" + - "~/.deeppavlov/cache:/root/.cache" + environment: + - CUDA_VISIBLE_DEVICES=8 + sentseg: + sentence-ranker: + volumes: + - "./services/sentence_ranker:/src" + - "~/.deeppavlov/cache:/root/.cache" + environment: + - CUDA_VISIBLE_DEVICES=7 + transformers-lm-mistral-7b-128k: + volumes: + - "./services/transformers_lm:/src" + - "./common:/src/common" + - "~/.deeppavlov/cache:/root/.cache" + environment: + - CUDA_VISIBLE_DEVICES=1 + doc-retriever: + volumes: + - "./annotators/doc_retriever:/src" + - "./common:/src/common" + - "./documents:/src/documents" + dff-document-qa-transformers-llm-skill: + volumes: + - "./skills/dff_document_qa_llm_skill:/src" + - "./common:/src/common" + - "./documents:/src/documents" + +version: "3.7" diff --git a/common/generative_configs/transformers_mistral.json b/common/generative_configs/transformers_mistral.json new file mode 100644 index 0000000000..6d3ba0c22d --- /dev/null +++ b/common/generative_configs/transformers_mistral.json @@ -0,0 +1,8 @@ +{ + "max_new_tokens": 4096, + "min_new_tokens": 8, + "top_p": 0.9, + 
"temperature": 0.9, + "do_sample": true, + "num_return_sequences": 2 +} \ No newline at end of file diff --git a/components.tsv b/components.tsv index 2865f696cd..396aad01df 100644 --- a/components.tsv +++ b/components.tsv @@ -188,3 +188,9 @@ 8182 8183 external-integration-skill 8184 external-fake-server +8185 transformers-mistral-7b-128k +8186 dff-document-qa-transformers-llm-skill +8187 +8188 +8189 +8190 \ No newline at end of file diff --git a/components/dfghndoifg023rn2.yml b/components/dfghndoifg023rn2.yml new file mode 100644 index 0000000000..ba63cb8fe8 --- /dev/null +++ b/components/dfghndoifg023rn2.yml @@ -0,0 +1,27 @@ +name: transformers_lm_mistral_7b_128k +display_name: Mistral 7B 128k Tokens +component_type: Generative +model_type: NN-based +is_customizable: false +author: publisher@deeppavlov.ai +description: An open-source English-only large language model which was fine-tuned + for instruction following but is NOT capable of code generation. + For more details, refer to + [HuggingFace Model Page](https://huggingface.co/NousResearch/Yarn-Mistral-7b-128k). + Free of charge. This model is up and running on our servers and can be used for free. +ram_usage: 50G +gpu_usage: 20G +group: services +connector: + protocol: http + timeout: 20.0 + url: http://transformers-lm-mistral-7b-128k:8185/respond +dialog_formatter: null +response_formatter: null +previous_services: null +required_previous_services: null +state_manager_method: null +tags: null +endpoint: respond +service: services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k +date_created: '2023-04-16T09:45:32' diff --git a/components/dfjksfoiu2093rn2oeif09.yml b/components/dfjksfoiu2093rn2oeif09.yml new file mode 100644 index 0000000000..a7599947e1 --- /dev/null +++ b/components/dfjksfoiu2093rn2oeif09.yml @@ -0,0 +1,32 @@ +name: timeout_service +display_name: Timeout Service +component_type: null +model_type: null +is_customizable: false +author: publisher@deeppavlov.ai +description: Timeout Service +ram_usage: 100M +gpu_usage: null +group: timeout_service +connector: + protocol: python + class_name: PredefinedTextConnector + response_text: Sorry, something went wrong inside. Please tell me, what did you + say. + annotations: + sentseg: + punct_sent: Sorry, something went wrong inside. Please tell me, what did you + say. + segments: + - Sorry, something went wrong inside. + - Please tell me, what did you say. +dialog_formatter: null +response_formatter: null +previous_services: null +required_previous_services: null +state_manager_method: add_bot_utterance_last_chance +tags: +- timeout +endpoint: respond +service: services/agent_services/service_configs/document_based_qa_transformers +date_created: '2023-03-04T19:27:44' diff --git a/components/dfkjdfboi3esv2quyw3.yml b/components/dfkjdfboi3esv2quyw3.yml new file mode 100644 index 0000000000..b27b290705 --- /dev/null +++ b/components/dfkjdfboi3esv2quyw3.yml @@ -0,0 +1,32 @@ +name: last_chance_service +display_name: Last Chance Service +component_type: null +model_type: null +is_customizable: false +author: publisher@deeppavlov.ai +description: Last Chance Service +ram_usage: 100M +gpu_usage: null +group: last_chance_service +connector: + protocol: python + class_name: PredefinedTextConnector + response_text: Sorry, something went wrong inside. Please tell me, what did you + say. + annotations: + sentseg: + punct_sent: Sorry, something went wrong inside. Please tell me, what did you + say. + segments: + - Sorry, something went wrong inside. 
+ - Please tell me, what did you say. +dialog_formatter: null +response_formatter: null +previous_services: null +required_previous_services: null +state_manager_method: add_bot_utterance_last_chance +tags: +- last_chance +endpoint: respond +service: services/agent_services/service_configs/document_based_qa_transformers +date_created: '2023-10-13T19:27:44' diff --git a/components/fwjh092j3rnrfy6.yml b/components/fwjh092j3rnrfy6.yml new file mode 100644 index 0000000000..d2f5d96c35 --- /dev/null +++ b/components/fwjh092j3rnrfy6.yml @@ -0,0 +1,29 @@ +name: dff_document_qa_llm_skill +display_name: LLM-based Q&A on Documents Skill using Transformers LLM +component_type: Generative +model_type: NN-based +is_customizable: false +author: publisher@deeppavlov.ai +description: + Transformers-based generative skill that answers the user's questions about a given document. + Uses doc_retriever to select the most relevant parts of the document and Transformers LLM + to generate the response based on the context, the question, and the selected parts. +ram_usage: 150M +gpu_usage: null +group: skills +connector: + protocol: http + timeout: 120.0 + url: http://dff-document-qa-transformers-llm-skill:8186/respond +dialog_formatter: + name: state_formatters.dp_formatters:dff_prompted_skill_formatter + skill_name: dff_document_qa_llm_skill +response_formatter: state_formatters.dp_formatters:skill_with_attributes_formatter_service +previous_services: + - skill_selectors +required_previous_services: null +state_manager_method: add_hypothesis +tags: null +endpoint: respond +service: skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill +date_created: "2023-03-16T09:45:32" diff --git a/services/agent_services/service_configs/document_based_qa_transformers/environment.yml b/services/agent_services/service_configs/document_based_qa_transformers/environment.yml new file mode 100644 index 0000000000..ec5b8be2ba --- /dev/null +++ b/services/agent_services/service_configs/document_based_qa_transformers/environment.yml @@ -0,0 +1,7 @@ +WAIT_HOSTS: '' +WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-1000} +HIGH_PRIORITY_INTENTS: 1 +RESTRICTION_FOR_SENSITIVE_CASE: 1 +ALWAYS_TURN_ON_ALL_SKILLS: 0 +LANGUAGE: EN +FALLBACK_FILE: fallbacks_dream_en.json diff --git a/services/agent_services/service_configs/document_based_qa_transformers/service.yml b/services/agent_services/service_configs/document_based_qa_transformers/service.yml new file mode 100644 index 0000000000..e50a3d119d --- /dev/null +++ b/services/agent_services/service_configs/document_based_qa_transformers/service.yml @@ -0,0 +1,18 @@ +name: agent +endpoints: +- respond +compose: + command: sh -c 'bin/wait && python -m deeppavlov_agent.run agent.pipeline_config=assistant_dists/document_based_qa_transformers/pipeline_conf.json' + environment: + WAIT_HOSTS: '' + WAIT_HOSTS_TIMEOUT: ${WAIT_TIMEOUT:-1000} + HIGH_PRIORITY_INTENTS: 1 + RESTRICTION_FOR_SENSITIVE_CASE: 1 + ALWAYS_TURN_ON_ALL_SKILLS: 0 + LANGUAGE: EN + FALLBACK_FILE: fallbacks_dream_en.json + volumes: + - .:/dp-agent + ports: + - 4242:4242 +proxy: null diff --git a/services/transformers_lm/Dockerfile b/services/transformers_lm/Dockerfile index ef2c4203ff..8e6cfd539e 100644 --- a/services/transformers_lm/Dockerfile +++ b/services/transformers_lm/Dockerfile @@ -12,6 +12,8 @@ ARG HALF_PRECISION ENV HALF_PRECISION ${HALF_PRECISION} ARG ADDITIONAL_EOS_TOKENS ENV ADDITIONAL_EOS_TOKENS ${ADDITIONAL_EOS_TOKENS} +ARG USE_FLASH_ATTENTION_2 +ENV USE_FLASH_ATTENTION_2 ${USE_FLASH_ATTENTION_2} 
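+# USE_FLASH_ATTENTION_2 is only plumbed through here; server.py reads it at runtime
+# and, when set to 1, loads the model with flash-attention-2 kernels, which the
+# Yarn-Mistral-7b-128k model card recommends for its long-context attention.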
COPY ./services/transformers_lm/requirements.txt /src/requirements.txt diff --git a/services/transformers_lm/server.py b/services/transformers_lm/server.py index def2297c10..50d2d6fb12 100644 --- a/services/transformers_lm/server.py +++ b/services/transformers_lm/server.py @@ -24,6 +24,9 @@ PRETRAINED_MODEL_NAME_OR_PATH = os.environ.get("PRETRAINED_MODEL_NAME_OR_PATH") HALF_PRECISION = os.environ.get("HALF_PRECISION", 0) HALF_PRECISION = 0 if HALF_PRECISION is None else bool(int(HALF_PRECISION)) +USE_FLASH_ATTENTION_2 = os.environ.get("USE_FLASH_ATTENTION_2", 0) +USE_FLASH_ATTENTION_2 = 0 if USE_FLASH_ATTENTION_2 is None else bool(int(USE_FLASH_ATTENTION_2)) + logger.info(f"PRETRAINED_MODEL_NAME_OR_PATH = {PRETRAINED_MODEL_NAME_OR_PATH}") LANGUAGE = os.getenv("LANGUAGE", "EN") HF_ACCESS_TOKEN = os.environ.get("HF_ACCESS_TOKEN", None) @@ -47,6 +50,7 @@ "lmsys/vicuna-13b-v1.3": json.load(open("common/generative_configs/default_generative_config.json", "r")), "dim/xglm-4.5B_ru_v10_epoch_6_step_41141": json.load(open("common/generative_configs/ruxglm_config.json", "r")), "ai-forever/ruGPT-3.5-13B": json.load(open("common/generative_configs/rugpt35_config.json", "r")), + "NousResearch/Yarn-Mistral-7b-128k": json.load(open("common/generative_configs/transformers_mistral.json", "r")), } @@ -155,6 +159,10 @@ def generate_responses(context, model, tokenizer, prompt, generation_params, con if HALF_PRECISION: additional_kwargs["torch_dtype"] = torch.float16 + if USE_FLASH_ATTENTION_2: + additional_kwargs["use_flash_attention_2"] = True + additional_kwargs["trust_remote_code"] = True + model = AutoModelForCausalLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH, **additional_kwargs) if torch.cuda.is_available(): model.to("cuda") diff --git a/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/environment.yml b/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/environment.yml new file mode 100644 index 0000000000..9baf8649d5 --- /dev/null +++ b/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/environment.yml @@ -0,0 +1,7 @@ +SERVICE_PORT: 8185 +SERVICE_NAME: transformers_lm_mistral_7b_128k +PRETRAINED_MODEL_NAME_OR_PATH: NousResearch/Yarn-Mistral-7b-128k +HALF_PRECISION: 1 +USE_FLASH_ATTENTION_2: 1 +CUDA_VISIBLE_DEVICES: '0' +FLASK_APP: server diff --git a/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/service.yml b/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/service.yml new file mode 100644 index 0000000000..2244010a0f --- /dev/null +++ b/services/transformers_lm/service_configs/transformers-lm-mistral-7b-128k/service.yml @@ -0,0 +1,44 @@ +name: transformers-lm-mistral-7b-128k +endpoints: +- respond +compose: + env_file: + - .env + build: + args: + SERVICE_PORT: 8185 + SERVICE_NAME: transformers_lm_mistral_7b_128k + PRETRAINED_MODEL_NAME_OR_PATH: NousResearch/Yarn-Mistral-7b-128k + HALF_PRECISION: 1 + USE_FLASH_ATTENTION_2: 1 + CUDA_VISIBLE_DEVICES: '0' + FLASK_APP: server + context: . 
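+    # These build args mirror the transformers-lm-mistral-7b-128k section of
+    # assistant_dists/document_based_qa_transformers/docker-compose.override.yml;
+    # keep the two in sync when changing the port, model path, or precision flags.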
+ dockerfile: ./services/transformers_lm/Dockerfile + command: flask run -h 0.0.0.0 -p 8185 + environment: + - CUDA_VISIBLE_DEVICES=0 + - FLASK_APP=server + deploy: + resources: + limits: + memory: 50G + reservations: + memory: 50G + volumes: + - ./services/transformers_lm:/src + - ./common:/src/common + - ~/.deeppavlov/cache:/root/.cache + ports: + - 8185:8185 +proxy: + command: + - nginx + - -g + - daemon off; + build: + context: dp/proxy/ + dockerfile: Dockerfile + environment: + - PROXY_PASS=dream.deeppavlov.ai:8185 + - PORT=8185 diff --git a/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-llm-skill/service.yml b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-llm-skill/service.yml index 9f59a4262b..0adf80814c 100644 --- a/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-llm-skill/service.yml +++ b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-llm-skill/service.yml @@ -4,6 +4,7 @@ endpoints: compose: env_file: - .env + - .env_secret build: args: SERVICE_PORT: 8166 diff --git a/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/environment.yml b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/environment.yml new file mode 100644 index 0000000000..eb323f8761 --- /dev/null +++ b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/environment.yml @@ -0,0 +1,8 @@ +SERVICE_PORT: 8186 +SERVICE_NAME: dff_document_qa_llm_skill +GENERATIVE_SERVICE_URL: http://transformers-lm-mistral-7b-128k:8185/respond +GENERATIVE_SERVICE_CONFIG: transformers_mistral.json +GENERATIVE_TIMEOUT: 120 +N_UTTERANCES_CONTEXT: 7 +FILE_SERVER_TIMEOUT: 30 +DOCUMENT_PROMPT_FILE: common/prompts/document_qa_instruction.json diff --git a/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/service.yml b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/service.yml new file mode 100644 index 0000000000..07e6af8d1e --- /dev/null +++ b/skills/dff_document_qa_llm_skill/service_configs/dff-document-qa-transformers-llm-skill/service.yml @@ -0,0 +1,31 @@ +name: dff-document-qa-transformers-llm-skill +endpoints: +- respond +compose: + env_file: + - .env + build: + args: + SERVICE_PORT: 8186 + SERVICE_NAME: dff_document_qa_llm_skill + GENERATIVE_SERVICE_URL: http://transformers-lm-mistral-7b-128k:8185/respond + GENERATIVE_SERVICE_CONFIG: transformers_mistral.json + GENERATIVE_TIMEOUT: 120 + N_UTTERANCES_CONTEXT: 7 + FILE_SERVER_TIMEOUT: 30 + DOCUMENT_PROMPT_FILE: common/prompts/document_qa_instruction.json + context: . + dockerfile: ./skills/dff_document_qa_llm_skill/Dockerfile + command: gunicorn --workers=1 server:app -b 0.0.0.0:8186 --reload + deploy: + resources: + limits: + memory: 128M + reservations: + memory: 128M + volumes: + - ./skills/dff_document_qa_llm_skill:/src + - ./common:/src/common + ports: + - 8186:8186 +proxy: null
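
After bringing the distribution up (docker-compose -f docker-compose.yml -f assistant_dists/document_based_qa_transformers/docker-compose.override.yml -f assistant_dists/document_based_qa_transformers/dev.yml up --build), the new LM service can be smoke-tested directly. The following is a minimal sketch, not part of the patch: it assumes the /respond endpoint accepts the same batched "dialog_contexts"/"prompts"/"configs" payload that services/transformers_lm/server.py uses for its other models, and the 8185 port mapping from dev.yml.

# smoke_test_mistral.py — a sketch under the assumptions stated above
import requests

SERVICE_URL = "http://localhost:8185/respond"  # 8185:8185 as mapped in dev.yml

# Same knobs as common/generative_configs/transformers_mistral.json,
# with max_new_tokens lowered from 4096 for a quick check.
generation_config = {
    "max_new_tokens": 128,
    "min_new_tokens": 8,
    "top_p": 0.9,
    "temperature": 0.9,
    "do_sample": True,
    "num_return_sequences": 2,
}

payload = {
    "dialog_contexts": [["Hi! What can you do?"]],  # one batch item: a list of utterances
    "prompts": ["You are a helpful assistant answering questions about documents."],
    "configs": [generation_config],
}

result = requests.post(SERVICE_URL, json=payload, timeout=120).json()
print(result)  # expected: one list of num_return_sequences hypotheses per context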