Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for biblatex extended name format #11975

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 155 additions & 52 deletions src/main/java/org/jabref/logic/importer/AuthorListParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
Expand Down Expand Up @@ -39,6 +40,11 @@ public class AuthorListParser {

/**
 * Matches a leading ASCII capital letter that is followed either by a dot plus a space or dash
 * (e.g. "J. " or "J.-"), or by an optional space plus a dash (e.g. "J-" or "J -").
 */
private static final Pattern STARTS_WITH_CAPITAL_LETTER_DOT_OR_DASH = Pattern.compile("^[A-Z](\\.[ -]| ?-)");

/**
 * Matches one {@code key=value} attribute of a BibLaTeX extended name, e.g. {@code family=Hasselt}.
 * Group 1 is the key (word characters only), group 2 is the value, which runs up to the next comma
 * or the end of the input. Consequently a value can never contain a comma itself.
 */
private static final Pattern EXTENDED_NAME_FORMAT_PATTERN = Pattern.compile("(\\w+)\\s*=\\s*([^,]+)(?:,\\s*|$)");

/** Number of characters in the author separator " and " (including both surrounding spaces). */
private static final int AUTHOR_SEPARATOR_LENGTH = 5; // Length of " and "

/**
 * Number of characters to skip once an " and " separator has been matched. This is one less than
 * {@link #AUTHOR_SEPARATOR_LENGTH}; presumably the remaining step is taken by the increment of the
 * scanning loop that uses this constant.
 */
private static final int NAME_SPLIT_INCREMENT = 4; // Increment after processing " and "
/**
* the raw bibtex author/editor field
*/
Expand Down Expand Up @@ -147,59 +153,37 @@ public AuthorList parse(@NonNull String listOfNames) {
listOfNames = simpleNormalForm.authors;
boolean andOthersPresent = simpleNormalForm.andOthersPresent;

// Handle case names in order lastname, firstname and separated by ","
// E.g., Ali Babar, M., Dingsøyr, T., Lago, P., van der Vliet, H.
final boolean authorsContainAND = listOfNames.toUpperCase(Locale.ENGLISH).contains(" AND ");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code was removed - therefore org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatterTest is failing. Please fix. 😅

final boolean authorsContainOpeningBrace = listOfNames.contains("{");
final boolean authorsContainSemicolon = listOfNames.contains(";");
final boolean authorsContainTwoOrMoreCommas = (listOfNames.length() - listOfNames.replace(",", "").length()) >= 2;
if (!authorsContainAND && !authorsContainOpeningBrace && !authorsContainSemicolon && authorsContainTwoOrMoreCommas) {
List<String> arrayNameList = Arrays.asList(listOfNames.split(","));

// Delete spaces for correct case identification
arrayNameList.replaceAll(String::trim);

// Looking for space between pre- and lastname
boolean spaceInAllParts = arrayNameList.stream().filter(name -> name.contains(" "))
.count() == arrayNameList.size();

// We hit the comma name separator case
// Usually the getAsLastFirstNamesWithAnd method would separate them if pre- and lastname are separated with "and"
// If not, we check if spaces separate pre- and lastname
if (spaceInAllParts) {
listOfNames = listOfNames.replace(",", " and");
} else {
// Looking for name affixes to avoid
// arrayNameList needs to reduce by the count off avoiding terms
// valuePartsCount holds the count of name parts without the avoided terms

int valuePartsCount = arrayNameList.size();
// Holds the index of each term which needs to be avoided
Collection<Integer> avoidIndex = new HashSet<>();

for (int i = 0; i < arrayNameList.size(); i++) {
if (AVOID_TERMS_IN_LOWER_CASE.contains(arrayNameList.get(i).toLowerCase(Locale.ROOT))) {
avoidIndex.add(i);
valuePartsCount--;
}
}
// Split the author list
List<String> authorsArray = splitAuthors(listOfNames);
List<Author> authors = new ArrayList<>(authorsArray.size());

if ((valuePartsCount % 2) == 0) {
// We hit the described special case with name affix like Jr
listOfNames = buildWithAffix(avoidIndex, arrayNameList).toString();
}
for (String authorString : authorsArray) {
authorString = authorString.trim();
Optional<Author> author = Optional.empty();
if (authorString.startsWith("family=")) {
// Try to parse using extended format
author = parseExtendedNameFormat(authorString);
}
}

// initialization of parser
original = listOfNames;
tokenStart = 0;
tokenEnd = 0;

// Parse author by author
List<Author> authors = new ArrayList<>(5); // 5 seems to be reasonable initial size
while (tokenStart < original.length()) {
getAuthor().ifPresent(authors::add);
if (author.isEmpty()) {
// Parse using getAuthor()
// Save current state
String savedOriginal = original;
int savedTokenStart = tokenStart;
int savedTokenEnd = tokenEnd;

// set original to authorString and reset token positions
original = authorString;
tokenStart = 0;
tokenEnd = 0;

author = getAuthor();

// restore original state
original = savedOriginal;
tokenStart = savedTokenStart;
tokenEnd = savedTokenEnd;
}
author.ifPresent(authors::add);
}

if (andOthersPresent) {
Expand All @@ -209,6 +193,96 @@ public AuthorList parse(@NonNull String listOfNames) {
return AuthorList.of(authors);
}

/**
* Parses a single author written in the BibLaTeX extended name format, i.e. a comma-separated list of
* {@code key=value} attributes such as {@code family=Hasselt, given=Hado P., prefix=van, useprefix=false}.
* <p>
* Only the keys {@code family}, {@code given}, {@code prefix}, {@code suffix} and {@code useprefix} are
* evaluated; any other key is collected but never read. The keys are looked up case-sensitively.
* If {@code useprefix} is {@code false} (compared case-insensitively) the prefix is discarded.
* <p>
* NOTE(review): a missing {@code family} attribute is not rejected - the author is then created with a
* {@code null} family name. Confirm that this is intended.
*
* @param authorString the string representing a single author in extended format
* @return the parsed author, or {@code Optional.empty()} if the string contains no {@code key=value} attribute
*/
private Optional<Author> parseExtendedNameFormat(String authorString) {
Map<String, String> nameParts = new HashMap<>();
Matcher matcher = EXTENDED_NAME_FORMAT_PATTERN.matcher(authorString);
// group(1) is the attribute key, group(2) its value; a repeated key overwrites the earlier value
while (matcher.find()) {
nameParts.put(matcher.group(1).trim(), matcher.group(2).trim());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: Named groups would make this code clearer

}

if (nameParts.isEmpty()) {
return Optional.empty();
}

// Attributes that were not given stay null and are passed on to the Author as null
String familyName = nameParts.get("family");
String givenName = nameParts.get("given");
String namePrefix = nameParts.get("prefix");
String nameSuffix = nameParts.get("suffix");
String usePrefix = nameParts.get("useprefix");

// Only an explicit "false" drops the prefix; an absent or any other value keeps it
if ("false".equalsIgnoreCase(usePrefix)) {
namePrefix = null;
}

// The Author constructor takes the initials as a separate argument, so derive them from the given name
String givenNameAbbreviated = abbreviateGivenName(givenName);
return Optional.of(new Author(givenName, givenNameAbbreviated, namePrefix, familyName, nameSuffix));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment doesn't say anything that the code doesn't say already, it is useless, please remove it.

}

/**
 * Builds the initials of a given name: every whitespace-separated part contributes its first
 * character followed by a dot, and the initials are joined by single spaces
 * (e.g. {@code "Hado P."} becomes {@code "H. P."}).
 * <p>
 * A part that already is an initial such as {@code "P."} maps onto itself, so no special handling
 * is required for names that are abbreviated already.
 *
 * @param givenName the given name to abbreviate; may be {@code null}
 * @return the initials, or {@code null} if {@code givenName} is {@code null}, empty or consists of
 *         whitespace only
 */
private String abbreviateGivenName(String givenName) {
    if (givenName == null) {
        return null;
    }
    String stripped = givenName.strip();
    if (stripped.isEmpty()) {
        // A blank name has no initials; treat it like a missing name instead of returning ""
        return null;
    }

    StringBuilder initials = new StringBuilder();
    // The stripped string neither starts nor ends with whitespace, so split() yields no empty parts
    for (String part : stripped.split("\\s+")) {
        if (initials.length() > 0) {
            initials.append(' ');
        }
        // Work on code points so that a first letter outside the BMP is not cut into a lone surrogate
        initials.appendCodePoint(part.codePointAt(0)).append('.');
    }
    return initials.toString();
}

/**
 * Splits an author list into the raw strings of the individual authors.
 * <p>
 * Authors are separated by the word "and" surrounded by single spaces. The separator is matched
 * case-insensitively (so " AND " separates authors as well) and is ignored while inside curly
 * braces, which keeps names such as {@code {Barnes and Noble}} in one piece. The returned parts are
 * not trimmed.
 *
 * @param authorList the author list string
 * @return the individual author strings in their original order; a list containing the whole
 *         input if no separator is found
 */
private List<String> splitAuthors(String authorList) {
    final String separator = " and ";
    List<String> authors = new ArrayList<>();
    int bracesLevel = 0;
    int start = 0;
    int i = 0;
    while (i < authorList.length()) {
        char c = authorList.charAt(i);
        if (c == '{') {
            bracesLevel++;
        } else if (c == '}') {
            // Never drop below zero: a stray '}' must not disable splitting for the rest of the list
            bracesLevel = Math.max(0, bracesLevel - 1);
        } else if ((bracesLevel == 0) && authorList.regionMatches(true, i, separator, 0, separator.length())) {
            authors.add(authorList.substring(start, i));
            // Continue right behind the separator; its trailing space is consumed with it
            i += separator.length();
            start = i;
            continue;
        }
        i++;
    }
    authors.add(authorList.substring(start));
    return authors;
}

/**
* Handles names in the order Firstname Lastname, separated by <code>","</code> and a final <code>", and "</code>
* E.g, <code>"I. Podadera, J. M. Carmona, A. Ibarra, and J. Molla"</code>
Expand Down Expand Up @@ -238,6 +312,35 @@ private static String checkNamesCommaSeparated(String listOfNames) {
* @return Preformatted author name; <CODE>Optional.empty()</CODE> if author name is empty.
*/
private Optional<Author> getAuthor() {
int savedTokenStart = tokenStart;

while (tokenStart < original.length() && Character.isWhitespace(original.charAt(tokenStart))) {
tokenStart++;
}
Comment on lines +317 to +319
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A comment above this block, however, would be helpful.


// check if the substring starting at tokenStart starts with "family="
if (original.startsWith("family=", tokenStart)) {
// try to parse up to the next 'and' or end of string
int indexOfAnd = original.indexOf(" and ", tokenStart);
String authorString;
if (indexOfAnd >= 0) {
authorString = original.substring(tokenStart, indexOfAnd).trim();
tokenStart = indexOfAnd + " and ".length();
} else {
authorString = original.substring(tokenStart).trim();
tokenStart = original.length();
}

Optional<Author> extendedAuthor = parseExtendedNameFormat(authorString);

if (extendedAuthor.isPresent()) {
return extendedAuthor;
} else {
// parsing failed so reset tokenStart
tokenStart = savedTokenStart;
}
}

List<Object> tokens = new ArrayList<>();
int vonStart = -1;
int lastStart = -1;
Expand Down
99 changes: 99 additions & 0 deletions src/test/java/org/jabref/logic/importer/AuthorListParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,103 @@ private static Stream<Arguments> parseMultipleCorrectly() {
void parseMultipleCorrectly(AuthorList expected, String authorsString) {
    // Parse first, then compare, so that a failure clearly points at the comparison
    AuthorList parsed = parser.parse(authorsString);

    assertEquals(expected, parsed);
}

/**
 * Supplies author lists written in the BibLaTeX extended name format.
 * Each argument pair consists of the expected {@link AuthorList} followed by the raw author string,
 * which is the same argument order that {@code parseMultipleCorrectly} uses.
 */
private static Stream<Arguments> parseExtendedFormatAuthors() {
    return Stream.of(
            // several authors; "useprefix=false" drops the prefix of the first one
            Arguments.of(
                    AuthorList.of(
                            new Author("Hado P.", "H. P.", null, "Hasselt", null),
                            new Author("Arthur", "A.", null, "Guez", null),
                            new Author("Matteo", "M.", null, "Hessel", null),
                            new Author("Volodymyr", "V.", null, "Mnih", null),
                            new Author("David", "D.", null, "Silver", null)
                    ),
                    "family=Hasselt, given=Hado P., prefix=van, useprefix=false and family=Guez, given=Arthur and family=Hessel, given=Matteo and family=Mnih, given=Volodymyr and family=Silver, given=David"
            ),
            // "useprefix=true" keeps the prefix
            Arguments.of(
                    AuthorList.of(
                            new Author("Hado P.", "H. P.", "van", "Hasselt", null),
                            new Author("John", "J.", null, "Smith", null)
                    ),
                    "family=Hasselt, given=Hado P., prefix=van, useprefix=true and family=Smith, given=John"
            ),
            // a prefix consisting of several words, no "useprefix" attribute at all
            Arguments.of(
                    AuthorList.of(
                            new Author("Michiel", "M.", "van der", "Ree", null),
                            new Author("Marco", "M.", null, "Wiering", null)
                    ),
                    "family=Ree, given=Michiel, prefix=van der and family=Wiering, given=Marco"
            ),
            // non-ASCII characters and a dash inside the family name
            Arguments.of(
                    AuthorList.of(
                            new Author("Abdallāh", "A.", null, "al-Ṣāliḥ", null)
                    ),
                    "family=al-Ṣāliḥ, given=Abdallāh"
            )
    );
}

@ParameterizedTest
@MethodSource
void parseExtendedFormatAuthors(AuthorList expected, String authorsString) {
    assertEquals(expected, parser.parse(authorsString));
}

// An author in BibLaTeX extended format may be followed by an author in the regular "Last, First" format
@Test
void parseMixedFormatAuthors() {
    Author extendedFormatAuthor = new Author("Hado P.", "H. P.", null, "Hasselt", null);
    Author regularFormatAuthor = new Author("Arthur", "A.", null, "Guez", null);

    AuthorList parsed = parser.parse("family=Hasselt, given=Hado P., prefix=van, useprefix=false and Guez, Arthur");

    assertEquals(AuthorList.of(extendedFormatAuthor, regularFormatAuthor), parsed);
}

// "useprefix=false" must lead to an author without prefix, although "prefix=van" is given
@Test
void parseAuthorWithUsePrefixFalse() {
    Author authorWithoutPrefix = new Author("Hado P.", "H. P.", null, "Hasselt", null);

    AuthorList parsed = parser.parse("family=Hasselt, given=Hado P., prefix=van, useprefix=false");

    assertEquals(AuthorList.of(authorWithoutPrefix), parsed);
}

// "useprefix=true" must keep the prefix given by "prefix=van"
@Test
void parseAuthorWithUsePrefixTrue() {
    Author authorWithPrefix = new Author("Hado P.", "H. P.", "van", "Hasselt", null);

    AuthorList parsed = parser.parse("family=Hasselt, given=Hado P., prefix=van, useprefix=true");

    assertEquals(AuthorList.of(authorWithPrefix), parsed);
}

// Attributes are only separated by commas. Without a comma between "prefix=van" and "useprefix=false",
// the whole remainder "van useprefix=false" is taken as the value of the prefix attribute.
@Test
void parseAuthorWithMissingComma() {
    Author authorWithMergedPrefix = new Author("Hado P.", "H. P.", "van useprefix=false", "Hasselt", null);

    AuthorList parsed = parser.parse("family=Hasselt, given=Hado P., prefix=van useprefix=false");

    assertEquals(AuthorList.of(authorWithMergedPrefix), parsed);
}

// An author that only has a "family" attribute is parsed with given name and initials being null
@Test
void parseExtendedFormatWithIncompleteData() {
    Author completeAuthor = new Author("John", "J.", null, "Smith", null);
    Author familyNameOnlyAuthor = new Author(null, null, null, "Doe", null);

    AuthorList parsed = parser.parse("family=Smith, given=John and family=Doe");

    assertEquals(AuthorList.of(completeAuthor, familyNameOnlyAuthor), parsed);
}
}

Loading