Skip to content
This repository has been archived by the owner on Apr 20, 2022. It is now read-only.

Commit

Permalink
Fix issue kermitt2#339, more robustness for patent number parsing
Browse files Browse the repository at this point in the history
Former-commit-id: 639cb4e
  • Loading branch information
kermitt2 committed Aug 22, 2018
1 parent 9774692 commit bcc6369
Show file tree
Hide file tree
Showing 3 changed files with 2,073 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.util.*;
import java.util.regex.*;
import java.lang.NumberFormatException;

import org.apache.commons.io.IOUtils;
import org.grobid.core.data.PatentItem;
Expand All @@ -18,6 +19,9 @@
import java.io.InputStream;
import java.io.InputStreamReader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Parser for patent references based on regular language rewriting.
* Input raw references are WYSIWYG references (i.e. reference string as
Expand All @@ -28,6 +32,8 @@
*/

public class PatentRefParser {
// Logger named after this class so that warnings emitted while parsing patent
// numbers are attributed to PatentRefParser rather than to another component.
private static final Logger LOGGER = LoggerFactory.getLogger(PatentRefParser.class);

private String rawText = null;
private int rawTextOffset = 0; // starting offset of the current raw text
private Pattern patent_pattern = null;
Expand Down Expand Up @@ -107,6 +113,7 @@ public PatentRefParser() {
provisional_pattern = compilePattern("provisional");
utility_pattern = compilePattern("utility");

// these patterns are now expressed in external resource files under grobid-home/lexicon/patent/
/*EP_pattern = Pattern.compile("((\\s|,|\\.|^|\\-)EPO?)|(E\\.(\\s)?P)|((E|e)uropean)|(européen)|(europ)");
DE_pattern = Pattern.compile("((\\s|,|\\.|^|\\-)DE)|(D\\.(\\s)?E)|((G|g)erman)|((D|d)eutsch)|(allemand)");
US_pattern = Pattern.compile("((\\s|,|\\.|^|\\-)US)|(U\\.(\\s)?S)|((U|u)nited(\\s|-)*(S|s)tate)|(USA)");
Expand Down Expand Up @@ -555,7 +562,14 @@ else if (numm.length() <= 7) {
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 9474)
year = "1995";
Expand Down Expand Up @@ -592,7 +606,14 @@ else if (numb < 999999)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 9389)
year = "2007";
Expand Down Expand Up @@ -620,12 +641,21 @@ else if (number.startsWith("62") && (appli || number.startsWith("62/"))) {
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 124715)
year = "2014";
else
else if (numb < 387330)
year = "2015";
else
year = "2016";
number = year + "0" + number;
}
else if (number.startsWith("29") && (appli || number.startsWith("29/"))) {
Expand All @@ -637,7 +667,14 @@ else if (number.startsWith("29") && (appli || number.startsWith("29/"))) {
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 3180)
year = "1992";
Expand Down Expand Up @@ -685,8 +722,10 @@ else if (numb < 463549)
year = "2013";
else if (numb < 474693)
year = "2014";
else
else if (numb < 505607)
year = "2015";
else
year = "2016";
number = year + "0" + number;
}
else if (number.startsWith("14") && (appli || number.startsWith("14/"))) {
Expand All @@ -697,12 +736,21 @@ else if (number.startsWith("14") && (appli || number.startsWith("14/"))) {
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 544379)
year = "2014";
else if (numb < 757791)
year = "2015";
else
year = "2015";
year = "2016";
number = year + "0" + number;
}
else if (number.startsWith("13") && (appli || number.startsWith("13/"))) {
Expand All @@ -713,7 +761,14 @@ else if (number.startsWith("13") && (appli || number.startsWith("13/"))) {
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 374487)
year = "2011";
Expand All @@ -732,7 +787,14 @@ else if (numb < 998975)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 5841)
year = "2007";
Expand All @@ -753,7 +815,14 @@ else if (numb < 930166)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
// Threshold is written in decimal on purpose: a leading zero (023305) would be
// parsed by Java as an octal literal, i.e. 9925, and shift the year boundary.
if (numb < 23305)
year = "2004";
Expand All @@ -772,7 +841,14 @@ else if (numb < 646743)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 32443)
year = "2001";
Expand All @@ -794,7 +870,14 @@ else if (numb < 746297)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 219723)
year = "1998";
Expand All @@ -816,7 +899,14 @@ else if (numb < 740756)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 176047)
year = "1993";
Expand All @@ -840,7 +930,14 @@ else if (numb < 777991)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 140321)
year = "1987";
Expand All @@ -866,7 +963,14 @@ else if (numb < 815501)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 108971)
year = "1979";
Expand Down Expand Up @@ -896,7 +1000,14 @@ else if (numb < 815454)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 103000)
year = "1970";
Expand Down Expand Up @@ -928,7 +1039,14 @@ else if (numb < 866211)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 80000)
year = "1960";
Expand Down Expand Up @@ -961,7 +1079,14 @@ else if (numb < 788000)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 68000)
year = "1948";
Expand Down Expand Up @@ -997,7 +1122,14 @@ else if (numb < 784000)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
if (numb < 57000)
year = "1935";
Expand Down Expand Up @@ -1036,7 +1168,14 @@ else if (numb < 719000)
number = number.substring(3, number.length());
number = number.replaceAll("[\\.\\s/,]", "");
// we check the range of the number for deciding about a year
int numb = Integer.parseInt(number);
int numb = -1;
try {
numb = Integer.parseInt(number);
} catch(NumberFormatException e) {
LOGGER.warn("Cannot parse extracted patent number: " + number);
}
if (numb == -1)
continue;
String year = null;
/*if (numb < 70000)
year = "1915";
Expand Down
Loading

0 comments on commit bcc6369

Please sign in to comment.