Skip to content

Commit

Permalink
Merge pull request #1 from Dankoy/batch
Browse files Browse the repository at this point in the history
Updated readme
  • Loading branch information
Dankoy authored Jul 6, 2024
2 parents 37ddd8b + 8e7f6b0 commit 89e566e
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 65 deletions.
14 changes: 4 additions & 10 deletions .github/workflows/dependencies_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
# This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle

name: Gradle CI clean build
name: Maven CI clean build

on:
pull_request:
branches: [ "dependencies", "main", "dev" ]
branches: [ "master", "dev" ]

permissions:
contents: read
Expand All @@ -27,11 +27,5 @@ jobs:
java-version: '21'
distribution: 'temurin'

- name: gradle validation
uses: gradle/actions/wrapper-validation@v3

- name: Setup Gradle
uses: gradle/actions/setup-gradle@v3

- name: Run the tests
run: ./gradlew test
- name: Run the Maven verify phase
run: mvn --batch-mode --update-snapshots verify
25 changes: 6 additions & 19 deletions .github/workflows/gradle.yml → .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
# This workflow will build a Java project with Gradle and cache/restore any dependencies to improve the workflow execution time
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-gradle

name: Java CI with Gradle
name: Java CI with Maven

on:
push:
branches: [ "main" ]
branches: [ "master" ]

permissions:
contents: read
Expand All @@ -28,21 +28,8 @@ jobs:
java-version: '21'
distribution: 'temurin'

- name: gradle validation
uses: gradle/actions/wrapper-validation@v3
- name: Run the Maven verify phase
run: mvn --batch-mode --update-snapshots verify

- name: Setup Gradle
uses: gradle/actions/setup-gradle@v3

- name: Run the tests
run: ./gradlew test

- name: Run build
run: ./gradlew bootJar

- name: Save result jar for further use
uses: actions/upload-artifact@v4
with:
name: app.jar
path: build/libs/*
retention-days: 5
- name: Build
run: mvn package
23 changes: 0 additions & 23 deletions .github/workflows/rebase_dependencies.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .github/workflows/super-linter-full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ name: Full Lint Code Base

on:
push:
branches: [ "main" ]
branches: [ "master" ]

permissions: read-all

Expand Down Expand Up @@ -40,5 +40,5 @@ jobs:
VALIDATE_SQL: false
VALIDATE_SQLFLUFF: false
FILTER_REGEX_EXCLUDE: "gradlew"
DEFAULT_BRANCH: main
DEFAULT_BRANCH: master
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/super-linter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ name: Lint Code Base

on:
pull_request:
branches: [ "main" ]
branches: [ "master" ]

permissions: read-all

Expand Down Expand Up @@ -40,5 +40,5 @@ jobs:
VALIDATE_SQL: false
VALIDATE_SQLFLUFF: false
FILTER_REGEX_EXCLUDE: "gradlew"
DEFAULT_BRANCH: main
DEFAULT_BRANCH: master
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
166 changes: 165 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,166 @@
# korvo-to-anki-lemmatizer
Lemmatizer for korvo-to-anki

[![GitHub Release](https://img.shields.io/github/v/release/dankoy/korvo-to-anki-lemmatizer)](https://github.com/Dankoy/korvo-to-anki-lemmatizer/releases/latest)
![GitHub Release Date](https://img.shields.io/github/release-date/dankoy/korvo-to-anki-lemmatizer)
![GitHub Downloads (all assets, all releases)](https://img.shields.io/github/downloads/dankoy/korvo-to-anki-lemmatizer/total)

[![Java CI with Maven](https://github.com/Dankoy/korvo-to-anki-lemmatizer/actions/workflows/maven.yml/badge.svg?branch=master)](https://github.com/Dankoy/korvo-to-anki-lemmatizer/actions/workflows/maven.yml)

[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FDankoy%2Fkorvo-to-anki-lemmatizer&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com)
![GitHub search hit counter](https://img.shields.io/github/search/dankoy/korvo-to-anki-lemmatizer/korvo-to-anki-lemmatizer)

Lemmatizer for [korvo-to-anki](https://github.com/Dankoy/korvo-to-anki).

Uses the [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/lemma.html) library for lemmatization.

# Purpose

If, like me, you translated words as they appear in KOReader, then you have many inflected words
like 'was' or 'maintained'. Such words can also be duplicated by their own lemmas (the
vocabulary_builder plugin never checks for that). But Anki cards should contain the lemmas of such
words. For example, the lemma of the word 'was' is 'be' and the lemma of the word 'maintained' is
'maintain'.

So this app checks for such words and rewrites them as lemmas.

Lemmatization works only for strings containing one word or hyphen.

# Usage

Functionality:

1) Check duplicated words and delete them in db
2) Check for words that already have lemmas in db and delete them from db.
3) Get lemmas from words and update them in db

To use this project correctly, keep in mind that backups are absolutely necessary, and that
automatic duplicate removal is done at your own risk. You can check duplicates before deleting
them manually or automatically. The application never tries to figure out which word is better to
keep and which should be deleted. It just deletes randomly if you have multiple duplicates.

## More on duplicates

Words like 'maintained' and 'maintains' are considered duplicates because they both have the lemma
'maintain'. The application will pick one of them and print it, so you can check and delete it
manually, or you can run a command to delete one of the words automatically.

You can also have words like 'maintain' and 'maintained'. In that case the lemma is already in the
db, which means you don't need to find the lemma for the word 'maintained'. The application will
find such words and print them, so you can check and delete them manually, or you can run a
command to delete them automatically.

## Run

#### On linux

`java -jar -Dspring.datasource.url=jdbc:sqlite:/path/to/vocabulary_builder.sqlite3 korvo-to-anki-lemmatizer.jar `

#### On windows

`java "-Dspring.datasource.url=jdbc:sqlite:\path\to\vocabulary_builder.sqlite3" -jar .\korvo-to-anki-lemmatizer.jar`


## Commands

```
lemmatize
* lemmatize-all-vocabularies, lav: Lemmatize vocabularies. Should be used only if check-on-duplicates command returns empty duplicates
check-existing-lemmas-if-exists, celie: Check if db contains lemmas that could be ignored
* auto-delete-duplicates, add: Automatically delete duplicates. Do on your own risk.
check-on-duplicates, cod: Check lemmas on duplicates
* auto-delete-words-lemmas-exists, adwle: Automatically delete words that already has lemmas in db for. Do on your own risk.
```

## 1. Check on existing lemmas

First check on already existing lemmas in db.

```shell
shell:>celie
17:57:38.607 [main] INFO com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Starting...
17:57:38.949 [main] INFO com.zaxxer.hikari.pool.HikariPool - HikariPool-1 - Added connection org.sqlite.jdbc4.JDBC4Connection@113dcaf8
17:57:38.951 [main] INFO com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Start completed.
[
{
"word" : "wretches",
"lemma" : "wretch",
"title" : "The Sorrows of Satan (Horror Classic)"
}, {
"word" : "keenest",
"lemma" : "keen",
"title" : "Orcs"
}
]
```

If the list is not empty, delete the duplicates manually or go to the next step.

## 2. Automatically remove words for which lemmas are present

This action cannot be undone.

```shell
shell:>adwle
18:02:05.785 [main] INFO r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 100
...
```

After deleting such words go to step 3

## 3. Check on duplicates

```shell
shell:>cod
18:03:52.271 [main] INFO com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Starting...
18:03:52.486 [main] INFO com.zaxxer.hikari.pool.HikariPool - HikariPool-1 - Added connection org.sqlite.jdbc4.JDBC4Connection@39f0c343
18:03:52.488 [main] INFO com.zaxxer.hikari.HikariDataSource - HikariPool-1 - Start completed.
[ {
"word" : "lurking",
"lemma" : "lurk",
"title" : "The Sorrows of Satan (Horror Classic)"
}, {
"word" : "Drapes",
"lemma" : "drape",
"title" : "Orcs"
}
]
```

If the list is empty, go to step 5; if not, delete the duplicates manually or go to step 4.

## 4. Automatically delete duplicates

```shell
shell:>add
18:03:55.322 [main] INFO r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 53
...

```

After deleting, go to step 5.

## 5. Lemmatize your db

```shell
shell:>lav
18:04:19.608 [main] INFO r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch size: 100
18:04:19.608 [main] INFO r.d.k.c.d.v.v.VocabularyDaoJdbc - Batch: [VocabularyLemmaFullDTO[word=dripping, lemma=drip, title=Title[id=1, name=Orcs, filter=1], createTime=1658041927, reviewTime=0, dueTime=1658042227, reviewCount=0, prevContext=null, nextContext=null, streakCount=0], VocabularyLemmaFullDTO[word=marshlands, lemma=marshland,
...
```
## Mentions
1) You can't run the lemmatizing command if duplicates of any kind exist.
```shell
shell:>lav
Command 'lemmatize-all-vocabularies' exists but is not currently available because first check and fix all duplicates
Details of the error have been omitted. You can use the stacktrace command to print the full stacktrace.
```
## 6. Move vocabulary builder db
Check if lemmas satisfy you, make another backup of old database.
Move updated database into your reader, check if everything works.
Run [korvo-to-anki](https://github.com/Dankoy/korvo-to-anki) app again if necessary.
20 changes: 17 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,30 @@

<groupId>ru.dankoy</groupId>
<artifactId>korvo-to-anki-lemmatizer</artifactId>
<version>0.1.0-SNAPSHOT</version>
<version>0.1.0</version>
<name>korvo-to-anki-lemmatizer</name>
<description>korvo-to-anki-lemmatizer</description>
<packaging>jar</packaging>

<licenses>
<license/>
<license>
<name>BSD-3-Clause</name>
<url>https://spdx.org/licenses/BSD-3-Clause.html</url>
<distribution>repo</distribution>
</license>
</licenses>
<developers>
<developer/>
<developer>
<id>Dankoy</id>
<name>Dankoy</name>
<email>-</email>
<url>https://github.com/Dankoy</url>
<roles>
<role>maintainer</role>
<role>developer</role>
</roles>
<timezone>Europe/Moscow</timezone>
</developer>
</developers>

<properties>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
@Command(group = "lemmatize")
public class LemmatizerCommand {

private boolean lemmatizeAvailable = false;
private boolean duplicatesCleaned = false;
private boolean duplicatesForExistingLemmasCleaned = false;
private List<VocabularyLemmaDTO> duplicates = new ArrayList<>();
private List<VocabularyLemmaDTO> alreadyExistingLemmas = new ArrayList<>();

Expand All @@ -47,7 +48,7 @@ public String checkOnDuplicates() {
duplicates = dtos.stream().filter(n -> !elements.add(n)).toList();

if (duplicates.isEmpty()) {
lemmatizeAvailable = true;
duplicatesCleaned = true;
}

return objectMapperService.convertToStringPrettyPrint(duplicates);
Expand Down Expand Up @@ -87,6 +88,10 @@ public String checkExistingLemmas() {
s -> s.word().equals(dto.lemma()))) // filter if lemma equals word in db
.toList();

if (alreadyExistingLemmas.isEmpty()) {
duplicatesForExistingLemmasCleaned = true;
}

return objectMapperService.convertToStringPrettyPrint(alreadyExistingLemmas);
}

Expand Down Expand Up @@ -123,7 +128,7 @@ public String lemmatizeAllVocabularies() {
@Bean
public AvailabilityProvider lemmatizeAvailability() {
return () ->
lemmatizeAvailable
(duplicatesCleaned && duplicatesForExistingLemmasCleaned)
? Availability.available()
: Availability.unavailable("first check and fix all duplicates");
}
Expand All @@ -141,6 +146,6 @@ public AvailabilityProvider lemmasExists() {
return () ->
!alreadyExistingLemmas.isEmpty()
? Availability.available()
: Availability.unavailable("some lemmas already exists, fix it");
: Availability.unavailable("no duplicates");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
@RequiredArgsConstructor
public class VocabularyDaoJdbc implements VocabularyDao {

private static final int BATCH_SIZE = 5;
private static final int BATCH_SIZE = 100;

private static final String COLUMN_TITLE_ID = "title_id";

Expand Down

0 comments on commit 89e566e

Please sign in to comment.