Merge branch 'release/v8.7.0'

duydo · May 8, 2023 · 2d9288b · 2d9288b
2 parents 092829c + 9de1e09
commit 2d9288b
Show file tree

Hide file tree

Showing 14 changed files with 233 additions and 229 deletions.
diff --git a/.env.sample b/.env.sample
@@ -1 +1,2 @@
-ES_VERSION=7.5.1
+ES_VERSION=8.7.0
+ELASTIC_PASSWORD=changeme
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -8,7 +8,6 @@ jobs:
     strategy:
       matrix:
         entry:
-          - { version: 11, distribution: 'adopt' }
           - { version: 17, distribution: 'adopt' }
     steps:
       - name: Checkout analysis-vietnamese 
@@ -33,4 +32,4 @@ jobs:
       - name: Build and Test
         run: |
           export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-          mvn --batch-mode test
+          mvn --batch-mode test
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,44 @@
+ARG ES_VERSION
+FROM docker.elastic.co/elasticsearch/elasticsearch:$ES_VERSION as builder
+
+USER root
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y && apt-get install -y software-properties-common build-essential
+RUN gcc --version
+RUN apt-get update -y && apt-get install -y make cmake pkg-config wget git
+
+ENV JAVA_HOME=/usr/share/elasticsearch/jdk
+ENV PATH=$JAVA_HOME/bin:$PATH
+
+# Build coccoc-tokenizer
+RUN echo "Build coccoc-tokenizer..."
+WORKDIR /tmp
+RUN git clone https://github.com/duydo/coccoc-tokenizer.git
+RUN mkdir /tmp/coccoc-tokenizer/build
+WORKDIR /tmp/coccoc-tokenizer/build
+RUN cmake -DBUILD_JAVA=1 ..
+RUN make install
+
+# Build analysis-vietnamese
+RUN echo "analysis-vietnamese..."
+WORKDIR /tmp
+RUN wget https://dlcdn.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz \
+    && tar xvf apache-maven-3.8.8-bin.tar.gz
+ENV MVN_HOME=/tmp/apache-maven-3.8.8
+ENV PATH=$MVN_HOME/bin:$PATH
+
+COPY . /tmp/elasticsearch-analysis-vietnamese
+WORKDIR /tmp/elasticsearch-analysis-vietnamese
+RUN mvn verify clean --fail-never
+RUN mvn --batch-mode -Dmaven.test.skip -e package
+
+FROM docker.elastic.co/elasticsearch/elasticsearch:$ES_VERSION
+ARG ES_VERSION
+ARG COCCOC_INSTALL_PATH=/usr/local
+ARG COCCOC_DICT_PATH=$COCCOC_INSTALL_PATH/share/tokenizer/dicts
+
+COPY --from=builder $COCCOC_INSTALL_PATH/lib/libcoccoc_tokenizer_jni.so /usr/lib
+COPY --from=builder $COCCOC_DICT_PATH $COCCOC_DICT_PATH
+COPY --from=builder /tmp/elasticsearch-analysis-vietnamese/target/releases/elasticsearch-analysis-vietnamese-$ES_VERSION.zip /
+RUN echo "Y" | /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch file:///elasticsearch-analysis-vietnamese-$ES_VERSION.zip
diff --git a/README.md b/README.md
@@ -108,20 +108,52 @@ The above example produces the following terms:
 
 ```
 
+## Use Docker
+
+Make sure you have installed both Docker and Docker Compose.
+
+### Build the image with Docker Compose
+
+```sh
+# Copy the sample env file, then edit the ES version and the password for user elastic in .env. Default password: changeme
+cp .env.sample .env
+docker compose build
+docker compose up
+```
+### Verify
+```sh
+curl -k http://elastic:changeme@localhost:9200/_analyze -H 'Content-Type: application/json' -d '
+{
+  "analyzer": "vi_analyzer",
+  "text": "Cộng hòa Xã hội chủ nghĩa Việt Nam"
+}'
+
+# Output
+{"tokens":[{"token":"cộng hòa","start_offset":0,"end_offset":8,"type":"<WORD>","position":0},{"token":"xã hội","start_offset":9,"end_offset":15,"type":"<WORD>","position":1},{"token":"chủ nghĩa","start_offset":16,"end_offset":25,"type":"<WORD>","position":2},{"token":"việt nam","start_offset":26,"end_offset":34,"type":"<WORD>","position":3}]}                                                                                     
+```
+
 ## Build from Source
 ### Step 1: Build C++ tokenizer for Vietnamese library
 ```sh
-git clone https://github.com/coccoc/coccoc-tokenizer.git
+git clone https://github.com/duydo/coccoc-tokenizer.git
 cd coccoc-tokenizer && mkdir build && cd build
 cmake -DBUILD_JAVA=1 ..
 make install
+# Link the coccoc shared lib to /usr/lib
+sudo ln -sf /usr/local/lib/libcoccoc_tokenizer_jni.* /usr/lib/
 ```
 By default, the `make install` installs:
-- the lib commands (`tokenizer`, `dict_compiler` and `vn_lang_tool`) under `/usr/local/bin`
-- the dynamic lib (`libcoccoc_tokenizer_jni.so`) under `/usr/local/lib/`. The plugin uses this lib directly.
-- the dictionary files under `/usr/local/share/tokenizer/dicts`. The plugin uses this path for `dict_path` by default.
+- The lib commands `tokenizer`, `dict_compiler` and `vn_lang_tool` under `/usr/local/bin`
+- The dynamic lib `libcoccoc_tokenizer_jni.so` under `/usr/local/lib/`. The plugin uses this lib directly.
+- The dictionary files under `/usr/local/share/tokenizer/dicts`. The plugin uses this path for `dict_path` by default.
+
+Verify
+```sh
+/usr/local/bin/tokenizer "Cộng hòa Xã hội chủ nghĩa Việt Nam"
+# cộng hòa	xã hội	chủ nghĩa	việt nam
+```
 
-Refer [the repo](https://github.com/coccoc/coccoc-tokenizer) for more information to build the library.
+Refer to [the repo](https://github.com/duydo/coccoc-tokenizer) for more information on building the library.
 
 
 ### Step 2: Build the plugin
@@ -136,7 +168,7 @@ Optionally, edit the `elasticsearch-analysis-vietnamese/pom.xml` to change the v
 
 ```xml
 ...
-<version>7.17.1</version>
+<version>8.7.0</version>
 ...
  ```
 
@@ -149,16 +181,19 @@ mvn package
 ### Step 3: Install the plugin on Elasticsearch
 
 ```sh
-bin/elasticsearch-plugin install file://target/releases/elasticsearch-analysis-vietnamese-7.17.1.zip
+bin/elasticsearch-plugin install file://target/releases/elasticsearch-analysis-vietnamese-8.7.0.zip
 ```
 
 ## Compatible Versions
 From v7.12.11, the plugin uses the CocCoc C++ tokenizer instead of the VnTokenizer by Lê Hồng Phương.
 I no longer maintain the plugin with the VnTokenizer; if you want to continue developing with it, refer to [the branch vntokenizer](https://github.com/duydo/elasticsearch-analysis-vietnamese/tree/vntokenizer).  
 
 | Vietnamese Analysis Plugin | Elasticsearch   |
-| -------------------------- |-----------------|
-| master                     | 7.16 ~ 7.17.1   |
+|----------------------------|-----------------|
+| master                     | 8.7.0           |
+| develop                    | 8.7.0           |
+| 8.7.0                      | 8.7.0           |
+| 7.16.1                     | 7.16 ~ 7.17.1   |
 | 7.12.1                     | 7.12.1 ~ 7.15.x |     
 | 7.3.1                      | 7.3.1           |   
 | 5.6.5                      | 5.6.5           |

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -1,15 +1,31 @@
 version: '3.4'
+
 services:
   elasticsearch:
-    image: docker.elastic.co/elasticsearch/elasticsearch:${ES_VERSION}
+    build:
+      context: .
+      args:
+        ES_VERSION: ${ES_VERSION}
     restart: on-failure
     ports:
       - "9200:9200"
-    volumes:
-      - ./target/releases/elasticsearch-analysis-vietnamese-${ES_VERSION}.zip:/usr/share/elasticsearch/plugin/elasticsearch-analysis-vietnamese-${ES_VERSION}.zip
-      - ./install-es-plugin.sh:/apps/install-es-plugin.sh
+    ulimits:
+      nofile:
+        soft: 65536
+        hard: 65536
+      memlock:
+        hard: -1
+        soft: -1
     environment:
-      - "ES_VERSION=${ES_VERSION}"
-      - "discovery.type=single-node"
-    entrypoint:
-      - /apps/install-es-plugin.sh
+      ES_JAVA_OPTS: "-Xmx2g -Xms2g"
+      ELASTIC_USERNAME: "elastic"
+      ELASTIC_PASSWORD: ${ELASTIC_PASSWORD}
+      bootstrap.memory_lock: "true"
+      discovery.type: "single-node"
+      xpack.security.enabled: "true"
+    networks:
+      - elastic
+
+networks:
+  elastic:
+    driver: bridge
diff --git a/install-es-plugin.sh b/install-es-plugin.sh
diff --git a/pom.xml b/pom.xml
@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.elasticsearch</groupId>
     <artifactId>elasticsearch-analysis-vietnamese</artifactId>
-    <version>7.17.8</version>
+    <version>8.7.0</version>
     <packaging>jar</packaging>
     <name>elasticsearch-analysis-vietnamese</name>
     <url>https://github.com/duydo/elasticsearch-analysis-vietnamese/</url>

diff --git a/src/main/java/com/coccoc/Tokenizer.java b/src/main/java/com/coccoc/Tokenizer.java
@@ -10,97 +10,102 @@
  * @author duydo, CocCoc team
  */
 public class Tokenizer {
-    public static final String TOKENIZER_SHARED_LIB_NAME = "coccoc_tokenizer_jni";
 
-    static {
-        System.loadLibrary(TOKENIZER_SHARED_LIB_NAME);
-    }
+  public static final String TOKENIZER_SHARED_LIB_NAME = "coccoc_tokenizer_jni";
+
+  static {
+    System.loadLibrary(TOKENIZER_SHARED_LIB_NAME);
+  }
 
 
-    public enum TokenizeOption {
-        NORMAL(0),
-        HOST(1),
-        URL(2);
+  public enum TokenizeOption {
+    NORMAL(0),
+    HOST(1),
+    URL(2);
 
-        private final int value;
+    private final int value;
 
-        TokenizeOption(int value) {
-            this.value = value;
-        }
+    TokenizeOption(int value) {
+      this.value = value;
+    }
 
-        public int value() {
-            return value;
-        }
+    public int value() {
+      return value;
     }
+  }
 
-    public static final String SPACE = " ";
-    public static final String UNDERSCORE = "_";
-    public static final String COMMA = ",";
-    public static final String DOT = ".";
+  public static final String SPACE = " ";
+  public static final String UNDERSCORE = "_";
+  public static final String COMMA = ",";
+  public static final String DOT = ".";
 
 
-    private static String dictPath = null;
+  private static String dictPath = null;
 
-    private static final class Loader {
-        private static final Tokenizer INSTANCE = get();
+  private static final class Loader {
 
-        private Loader() {
-        }
+    private static final Tokenizer INSTANCE = get();
 
-        private static Tokenizer get() {
-            return new Tokenizer(dictPath);
-        }
+    private Loader() {
     }
 
-    public static Tokenizer getInstance(String dictPath) {
-        Tokenizer.dictPath = dictPath;
-        return Loader.INSTANCE;
+    private static Tokenizer get() {
+      return new Tokenizer(dictPath);
     }
+  }
+
+  public static Tokenizer getInstance(String dictPath) {
+    Tokenizer.dictPath = dictPath;
+    return Loader.INSTANCE;
+  }
 
-    private Tokenizer(String dictPath) {
-        int status = initialize(dictPath);
-        if (0 > status) {
-            throw new RuntimeException(String.format("Cannot initialize Tokenizer: %s", dictPath));
-        }
+  private Tokenizer(String dictPath) {
+    int status = initialize(dictPath);
+    if (0 > status) {
+      throw new RuntimeException(String.format("Cannot initialize Tokenizer: %s", dictPath));
+    }
+  }
+
+  public List<Token> segment(String text, TokenizeOption option, boolean keepPunctuation) {
+    if (text == null) {
+      throw new IllegalArgumentException("text is null");
+    }
+    long resPointer = segmentPointer(text, false, option.value(), keepPunctuation);
+    if (resPointer < 0) {
+      throw new RuntimeException("Cannot segment the text");
     }
 
-    public List<Token> segment(String text, TokenizeOption option, boolean keepPunctuation) {
-        if (text == null) {
-            throw new IllegalArgumentException("text is null");
-        }
-        long resPointer = segmentPointer(text, false, option.value(), keepPunctuation);
-
-        final List<Token> tokens = new ArrayList<>();
-        // Positions from JNI implementation .cpp file
-        int rangesSize = (int) Unsafe.UNSAFE.getLong(resPointer + 8 * 2);
-        long rangesDataPointer = Unsafe.UNSAFE.getLong(resPointer + 8 * 3);
-        int tokenSize = 4 * 6;
-        for (int i = 0, spacePos = 0; i < rangesSize; ++i) {
-            // Positions of UNSAFE values are calculated from {struct Token} in tokenizer.hpp
-            int originalStartPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * tokenSize + 8);
-            int originalEndPos = Unsafe.UNSAFE.getInt(rangesDataPointer + i * tokenSize + 12);
-            int type = Unsafe.UNSAFE.getInt(rangesDataPointer + i * tokenSize + 16);
-            int segType = Unsafe.UNSAFE.getInt(rangesDataPointer + i * tokenSize + 20);
-
-            // Build substring from UNSAFE array of codepoints
-            final StringBuilder sb = new StringBuilder();
-            for (int j = originalStartPos; j < originalEndPos; ++j) {
-                sb.appendCodePoint(text.charAt(j));
-            }
-            tokens.add(new Token(segType == 1 ? sb.toString().replace(COMMA, DOT) : sb.toString(),
-                    Token.Type.fromInt(type), Token.SegType.fromInt(segType), originalStartPos, originalEndPos));
-        }
-        freeMemory(resPointer);
-        return tokens;
+    final List<Token> tokens = new ArrayList<>();
+    // Positions from JNI implementation .cpp file
+    int rangesSize = Unsafe.getInt(resPointer + 8 * 2);
+    long rangesDataPointer = Unsafe.getLong(resPointer + 8 * 3);
+    int tokenSize = 4 * 6;
+    for (int i = 0; i < rangesSize; ++i) {
+      // Positions of UNSAFE values are calculated from {struct Token} in tokenizer.hpp
+      int originalStartPos = Unsafe.getInt(rangesDataPointer + i * tokenSize + 8);
+      int originalEndPos = Unsafe.getInt(rangesDataPointer + i * tokenSize + 12);
+      int type = Unsafe.getInt(rangesDataPointer + i * tokenSize + 16);
+      int segType = Unsafe.getInt(rangesDataPointer + i * tokenSize + 20);
+
+      // Build substring from UNSAFE array of codepoints
+      final StringBuilder sb = new StringBuilder();
+      for (int j = originalStartPos; j < originalEndPos; ++j) {
+        sb.appendCodePoint(text.charAt(j));
+      }
+      tokens.add(new Token(segType == 1 ? sb.toString().replace(COMMA, DOT) : sb.toString(),
+          Token.Type.fromInt(type), Token.SegType.fromInt(segType), originalStartPos, originalEndPos));
     }
+    freeMemory(resPointer);
+    return tokens;
+  }
 
 
-    //Calls CocCoc lib's segmentPointer function
-    public native long segmentPointer(String text, boolean forTransforming, int tokenizeOption, boolean keepPunctuation);
+  //Calls CocCoc lib's segmentPointer function
+  public native long segmentPointer(String text, boolean forTransforming, int tokenizeOption, boolean keepPunctuation);
 
-    //Calls CocCoc lib's freeMemory function
-    private native void freeMemory(long resPointer);
+  //Calls CocCoc lib's freeMemory function
+  private native void freeMemory(long resPointer);
 
-    //Calls CocCoc lib's initialize function
-    private native int initialize(String dictPath);
+  //Calls CocCoc lib's initialize function
+  private native int initialize(String dictPath);
 }