Fix #2775: Hyphens in last names are properly parsed (#3209)

JabRef · Sep 12, 2017 · c9445d4 · c9445d4
1 parent 58fec29
commit c9445d4
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 33 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -40,6 +40,7 @@ We refer to [GitHub issues](https://github.com/JabRef/jabref/issues) by using `#
 - We fixed an issue where metadata syncing with local and shared database were unstable. It will also fix syncing groups and sub-groups in database. [#2284](https://github.com/JabRef/jabref/issues/2284)
 - We fixed an issue where it was possible to leave the entry editor with an imbalance of braces. [#3167](https://github.com/JabRef/jabref/issues/3167)
 - Renaming files now truncates the filename to not exceed the limit of 255 chars [#2622](https://github.com/JabRef/jabref/issues/2622)
+- We improved the handling of hyphens in names. [#2775](https://github.com/JabRef/jabref/issues/2775)
 
 ### Removed
 - We removed support for LatexEditor, as it is not under active development. [#3199](https://github.com/JabRef/jabref/issues/3199)

diff --git a/src/main/java/org/jabref/model/entry/AuthorListParser.java b/src/main/java/org/jabref/model/entry/AuthorListParser.java
@@ -32,25 +32,6 @@ public class AuthorListParser {
     // Constant HashSet containing names of TeX special characters
     private static final Set<String> TEX_NAMES = new HashSet<>();
 
-    /** the raw bibtex author/editor field */
-    private String original;
-
-    /** index of the start in original, for example to point to 'abc' in 'abc xyz', tokenStart=2 */
-    private int tokenStart;
-
-    /** index of the end in original, for example to point to 'abc' in 'abc xyz', tokenEnd=5 */
-    private int tokenEnd;
-
-    /** end of token abbreviation (always: tokenStart < tokenAbbr <= tokenEnd), only valid if getToken returns TOKEN_WORD */
-    private int tokenAbbr;
-
-
-    /** either space of dash */
-    private char tokenTerm;
-
-    /** true if upper-case token, false if lower-case */
-    private boolean tokenCase;
-
     static {
         TEX_NAMES.add("aa");
         TEX_NAMES.add("ae");
@@ -66,6 +47,32 @@ public class AuthorListParser {
         TEX_NAMES.add("j");
     }
 
+    /**
+     * the raw bibtex author/editor field
+     */
+    private String original;
+    /**
+     * index of the start in original, for example to point to 'abc' in '  abc xyz', tokenStart=2
+     */
+    private int tokenStart;
+    /**
+     * index of the end (exclusive) in original, for example to point to 'abc' in '  abc xyz', tokenEnd=5
+     */
+    private int tokenEnd;
+    /**
+     * end of token abbreviation (always: tokenStart < tokenAbbrEnd <= tokenEnd), only valid if getToken returns
+     * TOKEN_WORD
+     */
+    private int tokenAbbrEnd;
+    /**
+     * either space or dash
+     */
+    private char tokenTerm;
+    /**
+     * true if upper-case token, false if lower-case
+     */
+    private boolean tokenCase;
+
     /**
      * Parses the String containing person names and returns a list of person information.
      *
@@ -121,7 +128,7 @@ private Optional<Author> getAuthor() {
                 break;
             case TOKEN_WORD:
                 tokens.add(original.substring(tokenStart, tokenEnd));
-                tokens.add(original.substring(tokenStart, tokenAbbr));
+                tokens.add(original.substring(tokenStart, tokenAbbrEnd));
                 tokens.add(tokenTerm);
                 tokens.add(tokenCase);
                 if (commaFirst >= 0) {
@@ -137,6 +144,13 @@ private Optional<Author> getAuthor() {
                             // We are in a first name which contained a hyphen
                             break;
                         }
+
+                        int thisTermToken = previousTermToken + TOKEN_GROUP_LENGTH;
+                        if ((thisTermToken >= 0) && tokens.get(thisTermToken).equals('-')) {
+                            // We are in a name which contained a hyphen
+                            break;
+                        }
+
                         vonStart = tokens.size() - TOKEN_GROUP_LENGTH;
                         break;
                     }
@@ -194,14 +208,16 @@ private Optional<Author> getAuthor() {
                     firstPartStart = 0;
                 }
             }
-        } else { // commas are present: it affects only 'first part' and
-            // 'junior part'
+        } else {
+            // commas are present: it affects only 'first part' and 'junior part'
             firstPartEnd = tokens.size();
-            if (commaSecond < 0) { // one comma
+            if (commaSecond < 0) {
+                // one comma
                 if (commaFirst < firstPartEnd) {
                     firstPartStart = commaFirst;
                 }
-            } else { // two or more commas
+            } else {
+                // two or more commas
                 if (commaSecond < firstPartEnd) {
                     firstPartStart = commaSecond;
                 }
@@ -342,7 +358,7 @@ private int getToken() {
             tokenEnd++;
             return TOKEN_AND;
         }
-        tokenAbbr = -1;
+        tokenAbbrEnd = -1;
         tokenTerm = ' ';
         tokenCase = true;
         int bracesLevel = 0;
@@ -353,8 +369,9 @@ private int getToken() {
             if (c == '{') {
                 bracesLevel++;
             }
-            if (firstLetterIsFound && (tokenAbbr < 0) && ((bracesLevel == 0) || (c == '{'))) {
-                tokenAbbr = tokenEnd;
+
+            if (firstLetterIsFound && (tokenAbbrEnd < 0) && ((bracesLevel == 0) || (c == '{'))) {
+                tokenAbbrEnd = tokenEnd;
             }
             if ((c == '}') && (bracesLevel > 0)) {
                 bracesLevel--;
@@ -388,8 +405,8 @@ private int getToken() {
             }
             tokenEnd++;
         }
-        if (tokenAbbr < 0) {
-            tokenAbbr = tokenEnd;
+        if (tokenAbbrEnd < 0) {
+            tokenAbbrEnd = tokenEnd;
         }
         if ((tokenEnd < original.length()) && (original.charAt(tokenEnd) == '-')) {
             tokenTerm = '-';

diff --git a/src/test/java/org/jabref/model/entry/AuthorListTest.java b/src/test/java/org/jabref/model/entry/AuthorListTest.java
@@ -7,6 +7,10 @@
 
 public class AuthorListTest {
 
+    public static int size(String bibtex) {
+        return AuthorList.parse(bibtex).getNumberOfAuthors();
+    }
+
     @Test
     public void testFixAuthorNatbib() {
         Assert.assertEquals("", AuthorList.fixAuthorNatbib(""));
@@ -286,10 +290,6 @@ public void testFixAuthorForAlphabetization() {
                         .fixAuthorForAlphabetization("John von Neumann and John Smith and de Black Brown, Jr., Peter"));
     }
 
-    public static int size(String bibtex) {
-        return AuthorList.parse(bibtex).getNumberOfAuthors();
-    }
-
     @Test
     public void testSize() {
 
@@ -625,6 +625,25 @@ public void parseNameWithHyphenInLastName() throws Exception {
         Assert.assertEquals(new AuthorList(expected), AuthorList.parse("Firstname Bailey-Jones"));
     }
 
+    @Test
+    public void parseNameWithHyphenInLastNameWithInitials() throws Exception {
+        Author expected = new Author("E. S.", "E. S.", null, "El-{M}allah", null);
+        Assert.assertEquals(new AuthorList(expected), AuthorList.parse("E. S. El-{M}allah"));
+    }
+
+    @Test
+    public void parseNameWithHyphenInLastNameWithEscaped() throws Exception {
+        Author expected = new Author("E. S.", "E. S.", null, "{K}ent-{B}oswell", null);
+        Assert.assertEquals(new AuthorList(expected), AuthorList.parse("E. S. {K}ent-{B}oswell"));
+    }
+
+    @Test
+    public void parseNameWithHyphenInLastNameWhenLastNameGivenFirst() throws Exception {
+        // TODO: The abbreviation should be "A."; currently the leading modifier letter ʿ is taken as the initial.
+        Author expected = new Author("ʿAbdallāh", "ʿ.", null, "al-Ṣāliḥ", null);
+        Assert.assertEquals(new AuthorList(expected), AuthorList.parse("al-Ṣāliḥ, ʿAbdallāh"));
+    }
+
     @Test
     public void parseNameWithBraces() throws Exception {
         Author expected = new Author("H{e}lene", "H.", null, "Fiaux", null);