feat: ngdi_api initial version
poc passed, now implemented in lib
wey-gu committed Mar 17, 2023
1 parent dd63777 commit 549ea71
Showing 12 changed files with 538 additions and 158 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -131,3 +131,6 @@ dmypy.json

# pdm
.pdm.toml

# requirements.txt is only for local development
requirements.txt
84 changes: 55 additions & 29 deletions README.md
@@ -119,35 +119,61 @@ Basically the same as Spark Engine, but with `engine="nebula"`.
ngdi is a unified abstraction layer for different engines. The current implementation is based on Spark, NetworkX, DGL, and NebulaGraph, but it is easy to extend to other engines such as Flink, GraphScope, and PyG.

```
┌───────────────────────────────────────────────────┐
│ Spark Cluster │
│ .─────. .─────. .─────. .─────. │
│ ; : ; : ; : ; : │
┌─▶│ : ; : ; : ; : ; │
│ │ ╲ ╱ ╲ ╱ ╲ ╱ ╲ ╱ │
│ │ `───' `───' `───' `───' │
Algo Spark │
Engine└───────────────────────────────────────────────────┘
│ ┌────────────────────────────────────────────────────┬──────────┐
└──┤ │ │
│ NebulaGraph Data Intelligence Suite(ngdi) │ ngdi-api │◀─┐
│ │ │ │
│ └──────────┤ │
│ ┌────────┐ ┌──────┐ ┌────────┐ ┌─────┐ │ │
│ │ Reader │ │ Algo │ │ Writer │ │ GNN │ │ │
┌───────▶│ └────────┘ └──────┘ └────────┘ └─────┘ │ │
│ │ │ │ │ │ │ │
│ │ ├────────────┴───┬────────┴─────┐ └──────┐ │ │
│ │ ▼ ▼ ▼ ▼ │ │
│ │ ┌─────────────┐ ┌──────────────┐ ┌──────────┐┌──────────┐ │ │
│ ┌──┤ │ SparkEngine │ │ NebulaEngine │ │ NetworkX ││ DGLEngine│ │ │
│ │ │ └─────────────┘ └──────────────┘ └──────────┘└──────────┘ │ │
│ │ └──────────┬────────────────────────────────────────────────────┘ │
│ │ │ Spark │
│ │ └────────Reader ────────────┐ │
│ Spark Query Mode │ │
│ Reader │ │
│Scan Mode ▼ ┌─────────┐
│ │ ┌───────────────────────────────────────────────────┬─────────┤ ngdi-udf│◀─────────────┐
│ │ │ │ └─────────┤ │
│ │ │ NebulaGraph Graph Engine Nebula-GraphD │ ngdi-GraphD │ │
│ │ ├──────────────────────────────┬────────────────────┼───────────────────┘ │
│ │ │ │ │ │
│ │ │ NebulaGraph Storage Engine │ │ │
│ │ │ │ │ │
│ └─▶│ Nebula-StorageD │ Nebula-Metad │ │
│ │ │ │ │
│ └──────────────────────────────┴────────────────────┘ │
│ │
│ ┌───────────────────────────────────────────────────────────────────────────────────────┐ │
│ │ RETURN ngdi("pagerank", ["follow"], ["degree"], "spark", {space: "basketballplayer"}) │──┘
│ └───────────────────────────────────────────────────────────────────────────────────────┘
│ ┌─────────────────────────────────────────────────────────────┐
│ │ from ngdi import NebulaReader │
│ │ │
│ │ # read data with spark engine, scan mode │
│ │ reader = NebulaReader(engine="spark") │
│ │ reader.scan(edge="follow", props="degree") │
└──│ df = reader.read() │
│ │
│ # run pagerank algorithm │
│ pr_result = df.algo.pagerank(reset_prob=0.15, max_iter=10) │
│ │
└─────────────────────────────────────────────────────────────┘
```
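The layer shown above routes one reader/algo/writer API onto whichever engine is selected at construction time. A minimal, illustrative sketch of that dispatch pattern (class names here are simplified stand-ins, not the real ngdi internals):

```python
# Illustrative engine-dispatch sketch; SparkEngine/NebulaEngine and
# Reader below are hypothetical stand-ins, not the actual ngdi classes.
class SparkEngine:
    def scan(self, edge, props):
        return f"spark-scan({edge}:{props})"

class NebulaEngine:
    def scan(self, edge, props):
        return f"nebula-scan({edge}:{props})"

_ENGINES = {"spark": SparkEngine, "nebula": NebulaEngine}

class Reader:
    """Unified reader facade: same API, engine chosen by name."""
    def __init__(self, engine="spark"):
        self._engine = _ENGINES[engine]()

    def scan(self, edge, props):
        return self._engine.scan(edge, props)

# the caller only swaps the engine name, as in the README examples
print(Reader(engine="nebula").scan("follow", "degree"))
```

Adding a new backend (Flink, GraphScope, PyG, ...) then amounts to registering one more engine class behind the same facade.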

### Spark Engine Prerequisites
44 changes: 37 additions & 7 deletions docs/Environment_Setup.md
@@ -41,7 +41,7 @@ Just visit [http://localhost:7001](http://localhost:7001) in your browser, with:
- user: `root`
- password: `nebula`

## Run In Production

### Run on PySpark Jupyter Notebook

@@ -98,15 +98,45 @@ pr_result = df.algo.pagerank(reset_prob=0.15, max_iter=10)
Then we can submit the job to Spark cluster:

```bash
# note: repeating --driver-class-path or --jars overrides the earlier value;
# use a ':'-separated classpath and a ','-separated jar list instead
spark-submit --master spark://sparkmaster:7077 \
    --driver-class-path "<hdfs_or_local_path_to>/nebula-spark-connector.jar:<hdfs_or_local_path_to>/nebula-algo.jar" \
    --jars "<hdfs_or_local_path_to>/nebula-spark-connector.jar,<hdfs_or_local_path_to>/nebula-algo.jar" \
    --py-files <hdfs_or_local_path_to>/ngdi-py3-env.zip \
    pagerank.py
```

## Prepare the py-files

```bash
pip install pdm
# prepare dep list in ngdi codebase
pdm export -o dist/requirements.txt --without-hashes
# build a wheel for ngdi
pdm build
# install the dependencies, plus ngdi itself, into a target directory
pip install -r dist/requirements.txt --target dist/dependencies
pip install . --target dist/dependencies
# zip dependencies and ngdi wheel
cd dist
zip -r ngdi-py3-env.zip dependencies
# copy ngdi-py3-env.zip to hdfs
hdfs dfs -put ngdi-py3-env.zip /
# check it's there
hdfs dfs -ls /
```

Now we have all files ready:

```bash
# hdfs dfs -ls /
Found 4 items
-rw-r--r-- 3 root supergroup 167042166 2023-03-17 03:54 /nebula-algo.jar
-rw-r--r-- 3 root supergroup 165992037 2023-03-17 03:54 /nebula-spark-connector.jar
-rw-r--r-- 3 root supergroup 5068513 2023-03-17 03:52 /ngdi-py3-env.zip
```

### Run ngdi algorithm PySpark job from python script

We have everything ready as above, including the `pagerank.py`.
89 changes: 87 additions & 2 deletions docs/ngdi_API_Gateway.md
@@ -1,14 +1,99 @@

# ngdi API Gateway

ngdi API Gateway is a RESTful API server that provides a unified interface for ngdi algorithms.

With the ngdi API Gateway and the ngdi UDF, we can call ngdi algorithms from nGQL.

## Calling from ngql

```cypher
RETURN ngdi("pagerank", ["follow"], ["degree"], "spark", {space: "basketballplayer", max_iter: 10}, {write_mode: "insert"})
```
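The positional arguments line up with the Python API shown elsewhere in this commit: algorithm name, edge types, properties, engine, algorithm config, and write config. A rough, hypothetical sketch of how a gateway might unpack such a call (not the actual ngdi-api code):

```python
# Hypothetical unpacking of the ngdi() UDF arguments into a job spec;
# the real ngdi-api request handling may differ.
def parse_ngdi_call(algo, edge_types, props, engine,
                    algo_conf=None, write_conf=None):
    algo_conf = dict(algo_conf or {})
    space = algo_conf.pop("space", None)   # e.g. "basketballplayer"
    return {
        "engine": engine,                  # e.g. "spark"
        "space": space,
        "scan": {"edge": edge_types[0], "props": props[0]},
        "algo": {"name": algo, "conf": algo_conf},
        "write": dict(write_conf or {}),   # e.g. {"write_mode": "insert"}
    }

job = parse_ngdi_call(
    "pagerank", ["follow"], ["degree"], "spark",
    {"space": "basketballplayer", "max_iter": 10},
    {"write_mode": "insert"},
)
print(job["scan"])   # {'edge': 'follow', 'props': 'degree'}
```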

## Setup ngdi API Gateway

For the Spark engine, we can run it from the Spark Jupyter Notebook, see: [../examples/ngdi_from_ngql_udf.ipynb](https://github.com/wey-gu/nebulagraph-di/blob/main/examples/ngdi_from_ngql_udf.ipynb)

For the NetworkX engine, we can run it the same way as in a Jupyter Notebook, see: [../examples/run_ngdi_api.py](https://github.com/wey-gu/nebulagraph-di/blob/main/examples/run_ngdi_api.py)

Alternatively, you can launch it with `pdm`:

```bash
export NGDI_PORT=9999
pdm run ngdi-api
```

## UDF build

See https://github.com/wey-gu/nebula/tree/ngdi_udf

- Build the `ngdi.so` binary:

```bash
export TAG=ubuntu2004
docker run -ti \
--network nebula-net \
--security-opt seccomp=unconfined \
-v "$PWD":/home/nebula \
-w /home/nebula \
--name nebula_dev \
vesoft/nebula-dev:$TAG \
bash

# inside the nebula_dev container:
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=$TOOLSET_CLANG_DIR/bin/g++ -DCMAKE_C_COMPILER=$TOOLSET_CLANG_DIR/bin/gcc -DENABLE_WERROR=OFF -DCMAKE_BUILD_TYPE=Release -DENABLE_TESTING=OFF ..

cd ../udf
make UDF=ngdi
```

## Setup ngdi-graphd

ngdi-graphd is simply a graphd with the ngdi UDF installed.

We only need to put the `ngdi.so` file into a path readable by graphd, such as `/udf/`, then point `--udf_path` to that path and set `--enable_udf=true`.

- Note that the `ngdi.so` file should be built in the same environment as the graphd.
- The `ngdi.so` file must be executable (`chmod +x ngdi.so`).
- The ngdi-api's URL should be set in the `ngdi_gateway_url_prefix` environment variable, i.e. `export ngdi_gateway_url_prefix="http://jupyter:9999"`.

Example docker compose:

```yaml
graphd:
  image: weygu/ngdi-graphd:2023.03.13
  container_name: ngdi_graphd
  environment:
    USER: root
    TZ: "${TZ:-Asia/Shanghai}"
    ngdi_gateway_url_prefix: "http://jupyter:9999"
  command:
    - --meta_server_addrs=metad0:9559,metad1:9559,metad2:9559
    - --port=9669
    - --local_ip=ngdi_graphd
    - --ws_ip=ngdi_graphd
    - --ws_http_port=19669
    - --log_dir=/logs
    - --v=5
    - --enable_udf=true
    - --udf_path=/udf/
  healthcheck:
    test: ["CMD", "curl", "-sf", "http://ngdi_graphd:19669/status"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 20s
  ports:
    - "29669:9669"
    - 19669
    - 19670
  volumes:
    - ./logs/graph:/logs
    - ./udf:/udf
  networks:
    - nebula-net
  restart: on-failure
  cap_add:
    - SYS_PTRACE
```
