Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Language detection in Java #541

Merged
merged 2 commits into from
Feb 3, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions redpen-core/src/main/java/cc/redpen/util/LanguageDetector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package cc.redpen.util;

import static cc.redpen.util.StringUtils.isProbablyJapanese;
import static java.lang.Math.min;

/**
 * Guesses which language/variant key best describes a piece of text.
 *
 * <p>Text without Japanese characters near its start is reported as {@code "en"}. Japanese text is
 * further classified by the punctuation it contains: {@code "ja"}, {@code "ja.zenkaku2"} or
 * {@code "ja.hankaku"}.
 */
public class LanguageDetector {
    /** Only this many leading characters are inspected when looking for Japanese script. */
    private static final int SAMPLE_LENGTH = 100;

    // Punctuation is spelled with Unicode escapes so that a fullwidth character can never be
    // confused with (or silently normalised to) its ASCII look-alike.

    /** Ideographic full stop, ideographic comma, fullwidth '!' and fullwidth '?'. */
    private static final String ZENKAKU_PUNCTUATION = "\u3002\u3001\uFF01\uFF1F";

    /** Fullwidth full stop and fullwidth comma. */
    private static final String ZENKAKU2_PUNCTUATION = "\uFF0E\uFF0C";

    /** Plain ASCII (halfwidth) punctuation. */
    private static final String HANKAKU_PUNCTUATION = ".,!?";

    /**
     * Detects the language of the given text.
     *
     * @param text the text to inspect; {@code null} is treated like text without Japanese characters
     * @return {@code "en"} when none of the first {@value #SAMPLE_LENGTH} characters is probably
     *         Japanese; otherwise {@code "ja"} if zenkaku punctuation is present, else
     *         {@code "ja.zenkaku2"} if a fullwidth full stop or comma is present, else
     *         {@code "ja.hankaku"} if ASCII punctuation is present, else {@code "ja"}
     */
    public String detectLanguage(String text) {
        if (text == null || !hasJapaneseCharacters(text)) {
            return "en";
        }
        // Order matters: the first punctuation family found in this sequence wins.
        if (containsAny(text, ZENKAKU_PUNCTUATION)) {
            return "ja";
        }
        if (containsAny(text, ZENKAKU2_PUNCTUATION)) {
            return "ja.zenkaku2";
        }
        if (containsAny(text, HANKAKU_PUNCTUATION)) {
            return "ja.hankaku";
        }
        return "ja";
    }

    /** Returns true if one of the leading {@value #SAMPLE_LENGTH} characters is probably Japanese. */
    private boolean hasJapaneseCharacters(String text) {
        int limit = min(text.length(), SAMPLE_LENGTH);
        for (int i = 0; i < limit; i++) {
            if (isProbablyJapanese(text.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /** Returns true if {@code text} contains at least one of the characters in {@code candidates}. */
    private static boolean containsAny(String text, String candidates) {
        for (int i = 0; i < candidates.length(); i++) {
            if (text.indexOf(candidates.charAt(i)) >= 0) {
                return true;
            }
        }
        return false;
    }
}
17 changes: 16 additions & 1 deletion redpen-core/src/main/java/cc/redpen/util/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,24 @@
*/
package cc.redpen.util;

import static java.lang.Character.UnicodeBlock.*;

public class StringUtils {
/**
 * Tells whether a character belongs to the Unicode {@code KATAKANA} block.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in the {@code KATAKANA} block
 */
public static boolean isKatakana(char c) {
    return Character.UnicodeBlock.of(c) == KATAKANA;
}

/**
 * Tells whether a character belongs to the Unicode {@code HIRAGANA} block.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in the {@code HIRAGANA} block
 */
public static boolean isHiragana(char c) {
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
    return HIRAGANA == block;
}

/**
 * Tells whether a character belongs to the Unicode {@code CJK_UNIFIED_IDEOGRAPHS} block.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in the {@code CJK_UNIFIED_IDEOGRAPHS} block
 */
public static boolean isCJK(char c) {
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
    return CJK_UNIFIED_IDEOGRAPHS == block;
}

/**
 * Tells whether a character is likely to be part of Japanese text, i.e. whether it lies in the
 * {@code HIRAGANA}, {@code KATAKANA} or {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in one of those three blocks
 */
public static boolean isProbablyJapanese(char c) {
    final Character.UnicodeBlock script = Character.UnicodeBlock.of(c);
    if (script == HIRAGANA || script == KATAKANA) {
        return true;
    }
    return script == CJK_UNIFIED_IDEOGRAPHS;
}

public static boolean isBasicLatin(char c) {
Expand Down
50 changes: 50 additions & 0 deletions redpen-core/src/test/java/cc/redpen/util/LanguageDetectorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package cc.redpen.util;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link LanguageDetector}.
 *
 * <p>Fullwidth punctuation in the inputs is written as a Unicode escape so that it stays distinct
 * from its ASCII look-alike regardless of how this file is edited or rendered.
 */
public class LanguageDetectorTest {
    private final LanguageDetector detector = new LanguageDetector();

    @Test
    public void englishIsDefault() throws Exception {
        assertEquals("en", detector.detectLanguage("Hello there!"));
    }

    @Test
    public void japaneseIsDetectedForKatakana() throws Exception {
        assertEquals("ja", detector.detectLanguage("コンピューター"));
    }

    @Test
    public void japaneseIsDetectedForHiragana() throws Exception {
        assertEquals("ja", detector.detectLanguage("はなぢ"));
    }

    @Test
    public void japaneseIsDetectedForKanji() throws Exception {
        assertEquals("ja", detector.detectLanguage("日本"));
        assertEquals("ja", detector.detectLanguage("最近"));
    }

    @Test
    public void zenkakuIsDefaultJapanese() throws Exception {
        assertEquals("ja", detector.detectLanguage("最近利用されているソフトウェアの中には"));
    }

    @Test
    public void zenkaku2IsDetectedUsingPunctuation() throws Exception {
        // \uFF0E is the fullwidth full stop; an ASCII '.' here would be hankaku punctuation.
        assertEquals("ja.zenkaku2", detector.detectLanguage("こんにちは世界\uFF0E"));
    }

    @Test
    public void zenkakuIsSelectedEvenIfSomeZenkaku2SymbolsArePresent() throws Exception {
        // First sentence ends with a fullwidth full stop (\uFF0E), second with an ideographic one.
        assertEquals("ja", detector.detectLanguage("こんにちは世界\uFF0E こんにちは世界。"));
    }

    @Test
    public void hankakuIsSelectedIfAsciiPunctuationIsUsedWithJapaneseSymbols() throws Exception {
        assertEquals("ja.hankaku", detector.detectLanguage("こんにちは世界!"));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
import cc.redpen.model.Document;
import cc.redpen.parser.DocumentParser;
import cc.redpen.util.FormatterUtils;
import cc.redpen.util.LanguageDetector;
import cc.redpen.validator.ValidationError;
import org.apache.wink.common.annotations.Workspace;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -37,7 +39,6 @@
import javax.ws.rs.*;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand All @@ -60,6 +61,20 @@ public class RedPenResource {
@Context
private ServletContext context;

/**
 * Detect the language of a document posted from a form.
 *
 * @param document the source text of the document; defaults to the empty string when the form
 *                 parameter is absent
 * @return a JSON object whose {@code "key"} property holds the value returned by
 *         {@link LanguageDetector#detectLanguage(String)}
 * @throws JSONException if the response object cannot be populated
 */
@Path("/language")
@POST
@Produces(MediaType.APPLICATION_JSON)
@WinkAPIDescriber.Description("Detect language of document")
public JSONObject detectLanguage(@FormParam("document") @DefaultValue("") String document) throws JSONException {
    final LanguageDetector detector = new LanguageDetector();
    final String detected = detector.detectLanguage(document);

    final JSONObject response = new JSONObject();
    response.put("key", detected);
    return response;
}

/**
* Validate a source document posted from a form
*
Expand All @@ -75,7 +90,7 @@ public class RedPenResource {
@POST
@Produces(MediaType.APPLICATION_JSON)
@WinkAPIDescriber.Description("Validate a document and return any redpen errors")
public Response validateDocument(@FormParam("document") @DefaultValue("") String document,
public String validateDocument(@FormParam("document") @DefaultValue("") String document,
@FormParam("documentParser") @DefaultValue(DEFAULT_DOCUMENT_PARSER) String documentParser,
@FormParam("lang") @DefaultValue(DEFAULT_CONFIGURATION) String lang,
@FormParam("format") @DefaultValue(DEFAULT_FORMAT) String format,
Expand All @@ -97,8 +112,7 @@ public Response validateDocument(@FormParam("document") @DefaultValue("") String
throw new RedPenException("Unsupported format: " + format + " - please use xml, plain, plain2, json or json2");
}

String responseJSON = formatter.format(parsedDocument, errors);
return Response.ok().entity(responseJSON).build();
return formatter.format(parsedDocument, errors);
}


Expand All @@ -120,7 +134,7 @@ public Response validateDocument(@FormParam("document") @DefaultValue("") String
@Consumes(MediaType.APPLICATION_JSON)
@Produces(MediaType.APPLICATION_JSON)
@WinkAPIDescriber.Description("Process a redpen JSON validation request and returns any redpen errors")
public Response validateDocumentJSON(JSONObject requestJSON) throws RedPenException {
public String validateDocumentJSON(JSONObject requestJSON) throws RedPenException {

LOG.info("Validating document using JSON request");
String documentParser = getOrDefault(requestJSON, "documentParser", DEFAULT_DOCUMENT_PARSER);
Expand Down Expand Up @@ -199,8 +213,7 @@ public Response validateDocumentJSON(JSONObject requestJSON) throws RedPenExcept
throw new RedPenException("Unsupported format: " + format + " - please use xml, plain, plain2, json or json2");
}

String responseJSON = formatter.format(parsedDocument, errors);
return Response.ok().entity(responseJSON).build();
return formatter.format(parsedDocument, errors);
}

private String getOrDefault(JSONObject json, String property, String defaultValue) {
Expand Down
13 changes: 6 additions & 7 deletions redpen-server/src/main/webapp/js/redpen.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,21 @@ var redpen = (function ($) {
if (callback) {
callback(data);
}
},
dataType: "json"
}
}).fail(function (err) {
console.log(err);
});
console.error(err);
});
};

// Overwrite the `baseUrl` variable (declared outside this view) with the given URL.
// NOTE(review): presumably the API-call helper prefixes request paths with it — confirm.
this.setBaseUrl = function(url) {
baseUrl = url;
};

// Detect the language of `text` by POSTing it to the 'document/language' API call and passing
// the `key` property of the response to `callback`.
// When `text` is empty or otherwise falsy no request is made and `callback` is not invoked.
this.detectLanguage = function (text, callback) {
    if (text) {
        doAPICall('document/language', {document: text}, function (data) {
            callback(data.key);
        }, 'POST');
    }
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ public void testJSValidatorRuns() throws Exception {
}
}

/** The language endpoint reports "en" for Latin-only text and "ja" for Japanese text. */
public void testDetectLanguage() throws Exception {
    String english = new RedPenResource().detectLanguage("Hello World").getString("key");
    assertEquals("en", english);

    String japanese = new RedPenResource().detectLanguage("こんにちは世界").getString("key");
    assertEquals("ja", japanese);
}

// test helper
private MockHttpServletRequest constructMockRequest(String method,
String requestURI,
Expand Down