Add FAQ and missing json output files (#3298)

PaddlePaddle · Sep 20, 2022 · 8a3a1aa · 8a3a1aa
1 parent 77b65d3
commit 8a3a1aa
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 1 deletion.
diff --git a/pipelines/FAQ.md b/pipelines/FAQ.md
@@ -152,3 +152,64 @@ pip install paddlenlp --upgrade
 ```
 pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
 ```
+
+#### Elasticsearch 日志显示错误
+
+需要编辑config/elasticsearch.yml，在末尾添加：
+
+```
+ingest.geoip.downloader.enabled: false
+```
+如果是Docker启动，请添加如下的配置，然后运行：
+
+```
+docker run \
+      -d \
+      --name es02 \
+      --net elastic \
+      -p 9200:9200 \
+      -e discovery.type=single-node \
+      -e ES_JAVA_OPTS="-Xms256m -Xmx256m" \
+      -e xpack.security.enabled=false \
+      -e ingest.geoip.downloader.enabled=false \
+      -e cluster.routing.allocation.disk.threshold_enabled=false \
+      -it \
+      docker.elastic.co/elasticsearch/elasticsearch:8.3.3
+```
+
+#### Windows上运行前端时报错：`requests.exceptions.MissingSchema: Invalid URL 'None/query': No scheme supplied. Perhaps you meant http://None/query?`
+
+这是环境变量没有生效导致的，请检查并确保PIPELINE_YAML_PATH和API_ENDPOINT已正确设置（以下为PowerShell命令）：
+
+```
+$env:PIPELINE_YAML_PATH='rest_api/pipeline/semantic_search.yaml'
+
+$env:API_ENDPOINT='http://127.0.0.1:8891'
+```
+
+#### Windows的GPU运行出现错误：`IndexError: index 4616429690595525704 is out of bounds for axis 0 with size 1`
+
+paddle.nonzero算子出现异常，请退回到PaddlePaddle 2.2.2版本，比如您使用的是cuda 11.2，可以使用如下的命令：
+
+```
+python -m pip install paddlepaddle-gpu==2.2.2.post112 -f https://www.paddlepaddle.org.cn/whl/windows/mkl/avx/stable.html
+```
+
+#### 运行应用的时候出现错误 `assert d == self.d`
+
+这是运行多个应用引起的，请在运行其他应用之前，删除现有的db文件：
+
+```
+rm -rf faiss_document_store.db
+```
+
+#### Windows运行应用的时候出现了下面的错误：`RuntimeError: (NotFound) Cannot open file C:\Users\my_name/.paddleocr/whl\det\ch\ch_PP-OCRv3_det_infer/inference.pdmodel, please confirm whether the file is normal.`
+
+这是由于Windows系统的用户名为中文导致的，详细解决方法请参考issue：[https://github.com/PaddlePaddle/PaddleNLP/issues/3242](https://github.com/PaddlePaddle/PaddleNLP/issues/3242)
+
+#### 怎样从GPU切换到CPU上运行？
+
+请在对应的所有`sh`文件里面加入下面的环境变量：
+```
+export CUDA_VISIBLE_DEVICES=""
+```
diff --git a/pipelines/pipelines/document_stores/base.py b/pipelines/pipelines/document_stores/base.py
@@ -31,7 +31,7 @@
 from pipelines.nodes.base import BaseComponent
 from pipelines.errors import DuplicateDocumentError
 from pipelines.nodes.preprocessor import PreProcessor
-from pipelines.document_stores.utils import eval_data_from_json, eval_data_from_jsonl
+from pipelines.document_stores.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
 
 logger = logging.getLogger(__name__)
 

diff --git a/pipelines/pipelines/document_stores/utils.py b/pipelines/pipelines/document_stores/utils.py
@@ -125,6 +125,21 @@ def eval_data_from_jsonl(
     yield docs, labels
 
 
+def squad_json_to_jsonl(squad_file: str, output_file: str):
+    """
+    Converts a SQuAD json file into jsonl format: each entry of its top-level "data" list is written as one line.
+    :param squad_file: Path of the SQuAD file in json format (read as UTF-8).
+    :param output_file: Path of the jsonl file to write (opened in "w" mode, UTF-8).
+    """
+    with open(squad_file, encoding="utf-8") as json_file, open(
+            output_file, "w", encoding="utf-8") as jsonl_file:
+        squad_json = json.load(json_file)
+
+        for doc in squad_json["data"]:
+            json.dump(doc, jsonl_file)
+            jsonl_file.write("\n")
+
+
 def _extract_docs_and_labels_from_dict(document_dict: Dict,
                                        preprocessor: PreProcessor = None,
                                        open_domain: bool = False):