Merge pull request #1 from Dankoy/batch

Updated readme
Dankoy · Jul 6, 2024 · 89e566e · 89e566e
2 parents 37ddd8b + 8e7f6b0
commit 89e566e
Show file tree

Hide file tree

Showing 9 changed files with 206 additions and 65 deletions.
diff --git a/.github/workflows/dependencies_check.yml b/.github/workflows/dependencies_check.yml
@@ -5,11 +5,11 @@
 # This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle
 
-name: Gradle CI clean build
+name: Maven CI clean build
 
 on:
   pull_request:
-    branches: [ "dependencies", "main", "dev" ]
+    branches: [ "master", "dev" ]
 
 permissions:
   contents: read
@@ -27,11 +27,5 @@ jobs:
         java-version: '21'
         distribution: 'temurin'
 
-    - name: gradle validation
-      uses: gradle/actions/wrapper-validation@v3
-
-    - name: Setup Gradle
-      uses: gradle/actions/setup-gradle@v3
-
-    - name: Run the tests
-      run: ./gradlew test
+    - name: Run the Maven verify phase
+      run: mvn --batch-mode --update-snapshots verify
diff --git a/.github/workflows/gradle.yml → .github/workflows/maven.yml b/.github/workflows/gradle.yml → .github/workflows/maven.yml
@@ -5,11 +5,11 @@
 # This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle
 
-name: Java CI with Gradle
+name: Java CI with Maven
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "master" ]
 
 permissions:
   contents: read
@@ -28,21 +28,8 @@ jobs:
         java-version: '21'
         distribution: 'temurin'
 
-    - name: gradle validation
-      uses: gradle/actions/wrapper-validation@v3
+    - name: Run the Maven verify phase
+      run: mvn --batch-mode --update-snapshots verify
 
-    - name: Setup Gradle
-      uses: gradle/actions/setup-gradle@v3
-
-    - name: Run the tests
-      run: ./gradlew test
-
-    - name: Run build
-      run: ./gradlew bootJar
-
-    - name: Save result jar for further use
-      uses: actions/upload-artifact@v4
-      with:
-       name: app.jar
-       path: build/libs/*
-       retention-days: 5
+    - name: Build
+      run: mvn package
diff --git a/.github/workflows/rebase_dependencies.yml b/.github/workflows/rebase_dependencies.yml
diff --git a/.github/workflows/super-linter-full.yml b/.github/workflows/super-linter-full.yml
@@ -8,7 +8,7 @@ name: Full Lint Code Base
 
 on:
   push:
-    branches: [ "main" ]
+    branches: [ "master" ]
 
 permissions: read-all    
 
@@ -40,5 +40,5 @@ jobs:
           VALIDATE_SQL: false
           VALIDATE_SQLFLUFF: false
           FILTER_REGEX_EXCLUDE: "gradlew"
-          DEFAULT_BRANCH: main
+          DEFAULT_BRANCH: master
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml
@@ -8,7 +8,7 @@ name: Lint Code Base
 
 on:
   pull_request:
-    branches: [ "main" ]
+    branches: [ "master" ]
 
 permissions: read-all    
 
@@ -40,5 +40,5 @@ jobs:
           VALIDATE_SQL: false
           VALIDATE_SQLFLUFF: false
           FILTER_REGEX_EXCLUDE: "gradlew"
-          DEFAULT_BRANCH: main
+          DEFAULT_BRANCH: master
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
@@ -1,2 +1,166 @@
 # korvo-to-anki-lemmatizer
-Lemmatizer for korvo-to-anki
+
+[![GitHub Release](https://img.shields.io/github/v/release/dankoy/korvo-to-anki-lemmatizer)](https://github.com/Dankoy/korvo-to-anki-lemmatizer/releases/latest)
+![GitHub Release Date](https://img.shields.io/github/release-date/dankoy/korvo-to-anki-lemmatizer)
+![GitHub Downloads (all assets, all releases)](https://img.shields.io/github/downloads/dankoy/korvo-to-anki-lemmatizer/total)
+
+[![Java CI with Maven](https://github.com/Dankoy/korvo-to-anki-lemmatizer/actions/workflows/maven.yml/badge.svg?branch=master)](https://github.com/Dankoy/korvo-to-anki-lemmatizer/actions/workflows/maven.yml)
+
+[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FDankoy%2Fkorvo-to-anki-lemmatizer&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com)
+![GitHub search hit counter](https://img.shields.io/github/search/dankoy/korvo-to-anki-lemmatizer/korvo-to-anki-lemmatizer)
+
+Lemmatizer for [korvo-to-anki](https://github.com/Dankoy/korvo-to-anki).
+
+Uses Stanford CoreNLP lib for lemmatization https://stanfordnlp.github.io/CoreNLP/lemma.html.
+
+# Purpose
+
+If, like me, you translate words as-is in KOReader, you end up with many inflected words such as
+'was' or 'maintained'. Such words can also duplicate their own lemma (the vocabulary_builder plugin
+never checks for that). Anki cards, however, should contain the lemmas of such words. For example,
+the lemma of 'was' is 'be' and the lemma of 'maintained' is 'maintain'.
+
+So this app finds such words and rewrites them as their lemmas.
+
+Lemmatization works only for strings containing one word or hyphen.
+
+# Usage
+
+Functionality:
+
+1) Check duplicated words and delete them in db
+2) Check for words that already have lemmas in db and delete them from db.
+3) Get lemmas from words and update them in db
+
+To use this project correctly, keep in mind that backups are absolutely necessary,
+and automatic duplicate removal is done at your own risk. You can check
+duplicates before deleting them manually or automatically. The application never tries to figure out
+which word is better to keep and which should be deleted. It just deletes randomly if you have
+multiple duplicates.
+
+## More on duplicates
+
+Words like 'maintained' and 'maintains' are considered duplicates because they both have the lemma
+'maintain'. The application will print one of them so you can check and delete it manually, or you
+can run a command to delete one of the words automatically.
+
+You can also have words like 'maintain' and 'maintained'. In that case the lemma is already in the db,
+which means you don't need to find the lemma for 'maintained'. The application will print such words
+so you can check and delete them manually, or you can run a command to delete them automatically.
+
+## Run 
+
+#### On linux
+
+`java -jar -Dspring.datasource.url=jdbc:sqlite:/path/to/vocabulary_builder.sqlite3 korvo-to-anki-lemmatizer.jar `
+
+#### On windows
+
+`java "-Dspring.datasource.url=jdbc:sqlite:\path\to\vocabulary_builder.sqlite3" -jar .\korvo-to-anki-lemmatizer.jar`
+
+
+## Commands
+
+```
+lemmatize
+       * lemmatize-all-vocabularies, lav: Lemmatize vocabularies. Should be used only if check-on-duplicates command returns empty duplicates
+       check-existing-lemmas-if-exists, celie: Check if db contains lemmas that could be ignored
+       * auto-delete-duplicates, add: Automatically delete duplicates. Do on your own risk.
+       check-on-duplicates, cod: Check lemmas on duplicates
+       * auto-delete-words-lemmas-exists, adwle: Automatically delete words that already has lemmas in db for. Do on your own risk.
+```
+
+## 1. Check on existing lemmas
+
+First check on already existing lemmas in db.
+
+```shell
+shell:>celie
+17:57:38.607 [main] INFO  com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Starting...
+17:57:38.949 [main] INFO  com.zaxxer.hikari.pool.HikariPool - HikariPool-1 - Added connection org.sqlite.jdbc4.JDBC4Connection@113dcaf8
+17:57:38.951 [main] INFO  com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Start completed.
+[ 
+{
+  "word" : "wretches",
+  "lemma" : "wretch",
+  "title" : "The Sorrows of Satan (Horror Classic)"
+}, {
+  "word" : "keenest",
+  "lemma" : "keen",
+  "title" : "Orcs"
+} 
+]
+```
+
+If the list is not empty then delete duplicates manually or go to next step.
+
+## 2. Automatically remove words for which lemmas are present
+
+This action cannot be undone.
+
+```shell
+shell:>adwle
+18:02:05.785 [main] INFO  r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 100
+...
+```
+
+After deleting such words go to step 3
+
+## 3. Check on duplicates
+
+```shell
+shell:>cod
+18:03:52.271 [main] INFO  com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Starting...
+18:03:52.486 [main] INFO  com.zaxxer.hikari.pool.HikariPool - HikariPool-1 - Added connection org.sqlite.jdbc4.JDBC4Connection@39f0c343
+18:03:52.488 [main] INFO  com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Start completed.
+[ {
+  "word" : "lurking",
+  "lemma" : "lurk",
+  "title" : "The Sorrows of Satan (Horror Classic)"
+}, {
+  "word" : "Drapes",
+  "lemma" : "drape",
+  "title" : "Orcs"
+}
+]
+```
+
+If empty go to step 5, if not delete manually or go to step 4
+
+## 4. Automatically delete duplicates
+
+```shell
+shell:>add
+18:03:55.322 [main] INFO  r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 53
+...
+
+```
+
+After deleting, go to step 5
+
+## 5. Lemmatize your db
+
+```shell
+shell:>lav
+18:04:19.608 [main] INFO  r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 100
+18:04:19.608 [main] INFO  r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch: [VocabularyLemmaFullDTO[word=dripping, lemma=drip, title=Title[id=1, name=Orcs, filter=1], createTime=1658041927, reviewTime=0, dueTime=1658042227, reviewCount=0, prevContext=null, nextContext=null, streakCount=0], VocabularyLemmaFullDTO[word=marshlands, lemma=marshland, 
+...
+```
+
+## Mentions
+
+1) You can't run the lemmatizing command if duplicates of any kind exist.
+
+```shell
+shell:>lav
+Command 'lemmatize-all-vocabularies' exists but is not currently available because first check and fix all duplicates
+Details of the error have been omitted. You can use the stacktrace command to print the full stacktrace.
+```
+
+## 6. Move vocabulary builder db
+
+Check if lemmas satisfy you, make another backup of old database.
+Move updated database into your reader, check if everything works.
+Run [korvo-to-anki](https://github.com/Dankoy/korvo-to-anki) app again if necessary.
+
+
diff --git a/pom.xml b/pom.xml
@@ -13,16 +13,30 @@
 
   <groupId>ru.dankoy</groupId>
   <artifactId>korvo-to-anki-lemmatizer</artifactId>
-  <version>0.1.0-SNAPSHOT</version>
+  <version>0.1.0</version>
   <name>korvo-to-anki-lemmatizer</name>
   <description>korvo-to-anki-lemmatizer</description>
   <packaging>jar</packaging>
 
   <licenses>
-    <license/>
+    <license>
+      <name>BSD-3-Clause</name>
+      <url>https://spdx.org/licenses/BSD-3-Clause.html</url>
+      <distribution>repo</distribution>
+    </license>
   </licenses>
   <developers>
-    <developer/>
+    <developer>
+      <id>Dankoy</id>
+      <name>Dankoy</name>
+      <email>-</email>
+      <url>https://github.com/Dankoy</url>
+      <roles>
+        <role>maintainer</role>
+        <role>developer</role>
+      </roles>
+      <timezone>Europe/Moscow</timezone>
+    </developer>
   </developers>
 
   <properties>

diff --git a/src/main/java/ru/dankoy/korvotoanki/core/command/LemmatizerCommand.java b/src/main/java/ru/dankoy/korvotoanki/core/command/LemmatizerCommand.java
@@ -22,7 +22,8 @@
 @Command(group = "lemmatize")
 public class LemmatizerCommand {
 
-  private boolean lemmatizeAvailable = false;
+  private boolean duplicatesCleaned = false;
+  private boolean duplicatesForExistingLemmasCleaned = false;
   private List<VocabularyLemmaDTO> duplicates = new ArrayList<>();
   private List<VocabularyLemmaDTO> alreadyExistingLemmas = new ArrayList<>();
 
@@ -47,7 +48,7 @@ public String checkOnDuplicates() {
     duplicates = dtos.stream().filter(n -> !elements.add(n)).toList();
 
     if (duplicates.isEmpty()) {
-      lemmatizeAvailable = true;
+      duplicatesCleaned = true;
     }
 
     return objectMapperService.convertToStringPrettyPrint(duplicates);
@@ -87,6 +88,10 @@ public String checkExistingLemmas() {
                             s -> s.word().equals(dto.lemma()))) // filter if lemma equals word in db
             .toList();
 
+    if (alreadyExistingLemmas.isEmpty()) {
+      duplicatesForExistingLemmasCleaned = true;
+    }
+
     return objectMapperService.convertToStringPrettyPrint(alreadyExistingLemmas);
   }
 
@@ -123,7 +128,7 @@ public String lemmatizeAllVocabularies() {
   @Bean
   public AvailabilityProvider lemmatizeAvailability() {
     return () ->
-        lemmatizeAvailable
+        (duplicatesCleaned && duplicatesForExistingLemmasCleaned)
             ? Availability.available()
             : Availability.unavailable("first check and fix all duplicates");
   }
@@ -141,6 +146,6 @@ public AvailabilityProvider lemmasExists() {
     return () ->
         !alreadyExistingLemmas.isEmpty()
             ? Availability.available()
-            : Availability.unavailable("some lemmas already exists, fix it");
+            : Availability.unavailable("no duplicates");
   }
 }
diff --git a/...n/java/ru/dankoy/korvotoanki/core/dao/vocabularybuilder/vocabulary/VocabularyDaoJdbc.java b/...n/java/ru/dankoy/korvotoanki/core/dao/vocabularybuilder/vocabulary/VocabularyDaoJdbc.java
@@ -25,7 +25,7 @@
 @RequiredArgsConstructor
 public class VocabularyDaoJdbc implements VocabularyDao {
 
-  private static final int BATCH_SIZE = 5;
+  private static final int BATCH_SIZE = 100;
 
   private static final String COLUMN_TITLE_ID = "title_id";