diff --git a/pipelines/FAQ.md b/pipelines/FAQ.md
index 136a87315111..06f972d0f26a 100644
--- a/pipelines/FAQ.md
+++ b/pipelines/FAQ.md
@@ -152,3 +152,64 @@ pip install paddlenlp --upgrade
 ```
 pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 ```
+
+#### Elasticsearch logs show an error
+
+Edit `config/elasticsearch.yml` and append the following line at the end:
+
+```
+ingest.geoip.downloader.enabled: false
+```
+If you start Elasticsearch with Docker, pass the setting as an environment variable instead, then run:
+
+```
+docker run \
+      -d \
+      --name es02 \
+      --net elastic \
+      -p 9200:9200 \
+      -e discovery.type=single-node \
+      -e ES_JAVA_OPTS="-Xms256m -Xmx256m" \
+      -e xpack.security.enabled=false \
+      -e ingest.geoip.downloader.enabled=false \
+      -e cluster.routing.allocation.disk.threshold_enabled=false \
+      -it \
+      docker.elastic.co/elasticsearch/elasticsearch:8.3.3
+```
+
+#### Running the web front end on Windows fails with `requests.exceptions.MissingSchema: Invalid URL 'None/query': No scheme supplied. Perhaps you meant http://None/query?`
+
+The environment variables have not taken effect. Check them and make sure `PIPELINE_YAML_PATH` and `API_ENDPOINT` are set:
+
+```
+$env:PIPELINE_YAML_PATH='rest_api/pipeline/semantic_search.yaml'
+
+$env:API_ENDPOINT='http://127.0.0.1:8891'
+```
+
+#### Running on GPU under Windows fails with `IndexError: index 4616429690595525704 is out of bounds for axis 0 with size 1`
+
+The `paddle.nonzero` operator raises this exception. Roll back to PaddlePaddle 2.2.2. For example, if you are using CUDA 11.2, run:
+
+```
+python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/windows/mkl/avx/stable.html
+```
+
+#### Running an application fails with `assert d == self.d`
+
+This is caused by running multiple applications against the same index. Delete the existing db file before running another application:
+
+```
+rm -rf faiss_document_store.db
+```
+
+#### Running an application on Windows fails with `RuntimeError: (NotFound) Cannot open file C:\Users\my_name/.paddleocr/whl\det\ch\ch_PP-OCRv3_det_infer/inference.pdmodel, please confirm whether the file is normal.`
+
+This happens when the Windows user name contains Chinese characters. For a detailed fix, see this issue: [https://github.com/PaddlePaddle/PaddleNLP/issues/3242](https://github.com/PaddlePaddle/PaddleNLP/issues/3242)
+
+#### How do I switch from GPU to CPU?
+
+Add the following environment variable to all of the relevant `sh` scripts:
+```
+export CUDA_VISIBLE_DEVICES=""
+```
diff --git a/pipelines/pipelines/document_stores/base.py b/pipelines/pipelines/document_stores/base.py
index 60e277297b37..e5206f4eec49 100644
--- a/pipelines/pipelines/document_stores/base.py
+++ b/pipelines/pipelines/document_stores/base.py
@@ -31,7 +31,7 @@
 from pipelines.nodes.base import BaseComponent
 from pipelines.errors import DuplicateDocumentError
 from pipelines.nodes.preprocessor import PreProcessor
-from pipelines.document_stores.utils import eval_data_from_json, eval_data_from_jsonl
+from pipelines.document_stores.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
 
 logger = logging.getLogger(__name__)
diff --git a/pipelines/pipelines/document_stores/utils.py b/pipelines/pipelines/document_stores/utils.py
index dd9227c8d3b1..9d0905f21e7c 100644
--- a/pipelines/pipelines/document_stores/utils.py
+++ b/pipelines/pipelines/document_stores/utils.py
@@ -125,6 +125,21 @@ def eval_data_from_jsonl(
         yield docs, labels
 
 
+def squad_json_to_jsonl(squad_file: str, output_file: str):
+    """
+    Converts a SQuAD JSON file into JSONL format, with one document per line.
+    :param squad_file: SQuAD file in JSON format.
+    :param output_file: Name of the output file (SQuAD in JSONL format).
+    """
+    with open(squad_file, encoding="utf-8") as json_file, open(
+            output_file, "w", encoding="utf-8") as jsonl_file:
+        squad_json = json.load(json_file)
+
+        for doc in squad_json["data"]:
+            json.dump(doc, jsonl_file)
+            jsonl_file.write("\n")
+
+
 def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                        preprocessor: PreProcessor = None,
                                        open_domain: bool = False):
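
A minimal usage sketch of the new `squad_json_to_jsonl` helper added above (the file paths here are hypothetical placeholders, not part of this change):

```python
from pipelines.document_stores.utils import squad_json_to_jsonl

# Flatten a SQuAD-format JSON file into JSONL, writing one "data" document
# per line so it can be streamed by loaders such as eval_data_from_jsonl.
squad_json_to_jsonl("data/squad_train.json", "data/squad_train.jsonl")
```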