From 68f6cb9e5f93806d83d3ebfd971ee27986a0c9ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirco=20Sch=C3=B6nfeld?= Date: Thu, 11 Jul 2019 21:43:02 +0200 Subject: [PATCH] BibTex: new parser (#2137) added a bibtex parser that extracts identifiers of entries in bib-files --- .../bib-simple.d/expected.tags | 14 + Units/parser-bibtex.r/bib-simple.d/input.bib | 97 ++++ docs/news.rst | 1 + main/parsers_p.h | 1 + parsers/bibtex.c | 425 ++++++++++++++++++ source.mak | 1 + win32/ctags_vs2013.vcxproj | 1 + win32/ctags_vs2013.vcxproj.filters | 3 + 8 files changed, 543 insertions(+) create mode 100644 Units/parser-bibtex.r/bib-simple.d/expected.tags create mode 100644 Units/parser-bibtex.r/bib-simple.d/input.bib create mode 100644 parsers/bibtex.c diff --git a/Units/parser-bibtex.r/bib-simple.d/expected.tags b/Units/parser-bibtex.r/bib-simple.d/expected.tags new file mode 100644 index 0000000000..3717d74cb2 --- /dev/null +++ b/Units/parser-bibtex.r/bib-simple.d/expected.tags @@ -0,0 +1,14 @@ +1957-doe_loc_ident input.bib /^@article{1957-doe_loc_ident,$/;" a +1959-rocket_exploration input.bib /^@INPROCEEDINGS{1959-rocket_exploration,$/;" j +1960-doe-location_splits input.bib /^@incollection{1960-doe-location_splits,$/;" I +1960-rocket_deep-exploration input.bib /^@conference{1960-rocket_deep-exploration,$/;" c +1960_conf_splits input.bib /^@proceedings{1960_conf_splits,$/;" P +1961-doe-diverse_splits input.bib /^@inbook{1961-doe-diverse_splits,$/;" i +1961_splits input.bib /^@techreport{1961_splits$/;" t +doe+rocket input.bib /^@Book{doe+rocket,$/;" b +doe_mastersth input.bib /^@mastersthesis{doe_mastersth,$/;" M +doe_mastersth_data input.bib /^@misc{doe_mastersth_data,$/;" n +doe_phd input.bib /^@phdthesis{doe_phd,$/;" p +man_loc_splits input.bib /^@manual{man_loc_splits,$/;" m +tiny_collect input.bib /^@booklet{tiny_collect,$/;" B +xx_thoughts input.bib /^@unpublished{xx_thoughts,$/;" u diff --git a/Units/parser-bibtex.r/bib-simple.d/input.bib b/Units/parser-bibtex.r/bib-simple.d/input.bib new file mode 100644 index 0000000000..f5256d00b0 --- /dev/null +++ b/Units/parser-bibtex.r/bib-simple.d/input.bib @@ -0,0 +1,97 @@ +% this is a comment that will be ignored + +@article{1957-doe_loc_ident, + author = "John Doe, + title = "Exploration of the + Location-Identity Split", + journal = "Journal of Splits", + year = 1957, + volume = 3 +} + +@Book{doe+rocket, + author = "John Doe and Rocket Scientist", + title = "Theory of Splits", + publisher = "Dover", + year = 1964, + address = "New York City", + edition = "ninth Dover printing, tenth GPO printing" +} + +@booklet{tiny_collect, + title = "A tiny collection of stuff" +} + +@conference{1960-rocket_deep-exploration, + author = "Rocket Scientist", + title = "Deep Exploration of the Singleton Split", + booktitle = "34th international conference of important stuff (ICIS)", + year = 1960 +} + +@inbook{1961-doe-diverse_splits, + author = "John Doe", + title = "A comprehensive list of splits", + pages = {13-39}, + publisher = "Penguin Books", + year = 1961 +} + +@incollection{1960-doe-location_splits, + author = "John Doe", + title = "Survey of location splits", + booktitle = "Current state of the art in computational methods", + publisher = "Penguin Books", + year = 1960 +} + +@INPROCEEDINGS{1959-rocket_exploration, + author = "Rocket Scientist", + title = "Exploration of the + Location-Singleton Split", + booktitle = "33th international conference of important stuff (ICIS)", + year = 1959 +} + +@manual{man_loc_splits, + title = "Introduction to Location Splits" +} + +@mastersthesis{doe_mastersth, + author = "John Doe" + title = "Evaluating location splits under identity constraints", + school = "School of Computer Science", + year = 1955 +} + +@misc{doe_mastersth_data, + author = "John Doe" + title = "Dataset of location splits under identity constraints", + howpublished = "http://johndoe.edu/masterthesis/data", + year = 1955 +} + +@phdthesis{doe_phd, + author = "John Doe" + title = "Evaluating location splits under diverse constraints", + school = "School of Computer Science", + year = 1958 +} + +@proceedings{1960_conf_splits, + title = "First international conference of splits", + year = 1960 +} + +@techreport{1961_splits +, author = "Rocket Scientist" +, title = "An introduction to advanced splits" +, institution = "School of Engineering" +, year = 1961 +} + +@unpublished{xx_thoughts, + author = "John Doe and Rocket Scientist", + title = "Thoughts on the future of splits", + note = "Heavily thought about" +} diff --git a/docs/news.rst b/docs/news.rst index 506968e41b..fe199f9747 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -42,6 +42,7 @@ The following parsers have been added: * Autoconf * Automake * AutoIt +* BibTex * Clojure * CMake *optlib* * CSS diff --git a/main/parsers_p.h b/main/parsers_p.h index 15aca2c1b2..fbb8120847 100644 --- a/main/parsers_p.h +++ b/main/parsers_p.h @@ -54,6 +54,7 @@ AwkParser, \ BasicParser, \ BetaParser, \ + BibtexParser, \ ClojureParser, \ CMakeParser, \ CParser, \ diff --git a/parsers/bibtex.c b/parsers/bibtex.c new file mode 100644 index 0000000000..6656992b4a --- /dev/null +++ b/parsers/bibtex.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2008, David Fishburn + * Copyright (c) 2012, Jan Larres + * Copyright (c) 2019, Mirco Schönfeld + * + * This source code is released for free distribution under the terms of the + * GNU General Public License version 2 or (at your option) any later version. + * + * This module contains functions for generating identifiers of entries of Bibtex language files. + * + * BibTex language "reference": + * https://en.wikipedia.org/wiki/BibTeX + */ + +/* + * INCLUDE FILES + */ +#include "general.h" /* must always come first */ +#include /* to define isalpha () */ +#include + +#include "debug.h" +#include "entry.h" +#include "keyword.h" +#include "parse.h" +#include "read.h" +#include "routines.h" +#include "vstring.h" + +/* + * MACROS + */ +#define isType(token,t) (bool) ((token)->type == (t)) +#define isKeyword(token,k) (bool) ((token)->keyword == (k)) +#define isIdentChar(c) \ + (isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+') + +/* + * DATA DECLARATIONS + */ + +/* + * Used to specify type of keyword. + */ +enum eKeywordId { + KEYWORD_article, + KEYWORD_book, + KEYWORD_booklet, + KEYWORD_conference, + KEYWORD_inbook, + KEYWORD_incollection, + KEYWORD_inproceedings, + KEYWORD_manual, + KEYWORD_mastersthesis, + KEYWORD_misc, + KEYWORD_phdthesis, + KEYWORD_proceedings, + KEYWORD_techreport, + KEYWORD_unpublished +}; +typedef int keywordId; /* to allow KEYWORD_NONE */ + +enum eTokenType { + /* 0..255 are the byte's value. Some are named for convenience */ + TOKEN_OPEN_CURLY = '{', + /* above is special types */ + TOKEN_UNDEFINED = 256, + TOKEN_KEYWORD, + TOKEN_IDENTIFIER +}; +typedef int tokenType; + +typedef struct sTokenInfo { + tokenType type; + keywordId keyword; + vString * string; + unsigned long lineNumber; + MIOPos filePosition; +} tokenInfo; + +/* + * DATA DEFINITIONS + */ + +static langType Lang_bib; + +typedef enum { + BIBTAG_ARTICLE, + BIBTAG_BOOK, + BIBTAG_BOOKLET, + BIBTAG_CONFERENCE, + BIBTAG_INBOOK, + BIBTAG_INCOLLECTION, + BIBTAG_INPROCEEDINGS, + BIBTAG_MANUAL, + BIBTAG_MASTERSTHESIS, + BIBTAG_MISC, + BIBTAG_PHDTHESIS, + BIBTAG_PROCEEDINGS, + BIBTAG_TECHREPORT, + BIBTAG_UNPUBLISHED, + BIBTAG_COUNT +} bibKind; + +static kindDefinition BibKinds [] = { + { true, 'a', "article", "article" }, + { true, 'b', "book", "book" }, + { true, 'B', "booklet", "booklet" }, + { true, 'c', "conference", "conference" }, + { true, 'i', "inbook", "inbook" }, + { true, 'I', "incollection", "incollection" }, + { true, 'j', "inproceedings", "inproceedings" }, + { true, 'm', "manual", "manual" }, + { true, 'M', "mastersthesis", "mastersthesis" }, + { true, 'n', "misc", "misc" }, + { true, 'p', "phdthesis", "phdthesis" }, + { true, 'P', "proceedings", "proceedings" }, + { true, 't', "techreport", "techreport" }, + { true, 'u', "unpublished", "unpublished" } +}; + +static const keywordTable BibKeywordTable [] = { + /* keyword keyword ID */ + { "article", KEYWORD_article }, + { "book", KEYWORD_book }, + { "booklet", KEYWORD_booklet }, + { "conference", KEYWORD_conference }, + { "inbook", KEYWORD_inbook }, + { "incollection", KEYWORD_incollection }, + { "inproceedings",KEYWORD_inproceedings }, + { "manual", KEYWORD_manual }, + { "mastersthesis",KEYWORD_mastersthesis }, + { "misc", KEYWORD_misc }, + { "phdthesis", KEYWORD_phdthesis }, + { "proceedings", KEYWORD_proceedings }, + { "techreport", KEYWORD_techreport }, + { "unpublished", KEYWORD_unpublished } +}; + +/* + * FUNCTION DEFINITIONS + */ + +static tokenInfo *newToken (void) +{ + tokenInfo *const token = xMalloc (1, tokenInfo); + + token->type = TOKEN_UNDEFINED; + token->keyword = KEYWORD_NONE; + token->string = vStringNew (); + token->lineNumber = getInputLineNumber (); + token->filePosition = getInputFilePosition (); + + return token; +} + +static void deleteToken (tokenInfo *const token) +{ + vStringDelete (token->string); + eFree (token); +} + +/* + * Tag generation functions + */ +static void makeBibTag (tokenInfo *const token, bibKind kind) +{ + if (BibKinds [kind].enabled) + { + const char *const name = vStringValue (token->string); + tagEntryInfo e; + initTagEntry (&e, name, kind); + + e.lineNumber = token->lineNumber; + e.filePosition = token->filePosition; + + makeTagEntry (&e); + } +} + +/* + * Parsing functions + */ + +/* + * Read a C identifier beginning with "firstChar" and places it into + * "name". + */ +static void parseIdentifier (vString *const string, const int firstChar) +{ + int c = firstChar; + Assert (isIdentChar (c)); + do + { + vStringPut (string, c); + c = getcFromInputFile (); + } while (c != EOF && isIdentChar (c)); + if (c != EOF) + ungetcToInputFile (c); /* unget non-identifier character */ +} + +static bool readToken (tokenInfo *const token) +{ + int c; + + token->type = TOKEN_UNDEFINED; + token->keyword = KEYWORD_NONE; + vStringClear (token->string); + +getNextChar: + + do + { + c = getcFromInputFile (); + } + while (c == '\t' || c == ' ' || c == '\n'); + + token->lineNumber = getInputLineNumber (); + token->filePosition = getInputFilePosition (); + + token->type = (unsigned char) c; + switch (c) + { + case EOF: return false; + + case '@': + /* + * All Bib entries start with an at symbol. + * Check if the next character is an alpha character + * else it is not a potential tex tag. + */ + c = getcFromInputFile (); + if (! isalpha (c)) + ungetcToInputFile (c); + else + { + vStringPut (token->string, '@'); + parseIdentifier (token->string, c); + token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib); + if (isKeyword (token, KEYWORD_NONE)) + token->type = TOKEN_IDENTIFIER; + else + token->type = TOKEN_KEYWORD; + } + break; + case '%': + skipToCharacterInInputFile ('\n'); /* % are single line comments */ + goto getNextChar; + break; + default: + if (isIdentChar (c)) + { + parseIdentifier (token->string, c); + token->type = TOKEN_IDENTIFIER; + } + break; + } + return true; +} + +static void copyToken (tokenInfo *const dest, tokenInfo *const src) +{ + dest->lineNumber = src->lineNumber; + dest->filePosition = src->filePosition; + dest->type = src->type; + dest->keyword = src->keyword; + vStringCopy (dest->string, src->string); +} + +/* + * Scanning functions + */ + +static bool parseTag (tokenInfo *const token, bibKind kind) +{ + tokenInfo * const name = newToken (); + vString * currentid; + bool eof = false; + + currentid = vStringNew (); + /* + * Bib entries are of these formats: + * @article{identifier, + * author="John Doe"} + * + * When a keyword is found, loop through all words up to + * a comma brace for the tag name. + * + */ + if (isType (token, TOKEN_KEYWORD)) + { + copyToken (name, token); + if (!readToken (token)) + { + eof = true; + goto out; + } + } + + if (isType (token, TOKEN_OPEN_CURLY)) + { + if (!readToken (token)) + { + eof = true; + goto out; + } + if (isType (token, TOKEN_IDENTIFIER)){ + vStringCat (currentid, token->string); + vStringStripTrailing (currentid); + if (vStringLength (currentid) > 0) + { + vStringCopy (name->string, currentid); + makeBibTag (name, kind); + } + } + else + { // should find an identifier for bib item at first place + eof = true; + goto out; + } + } + + + out: + deleteToken (name); + vStringDelete (currentid); + return eof; +} + +static void parseBibFile (tokenInfo *const token) +{ + bool eof = false; + + do + { + if (!readToken (token)) + break; + + if (isType (token, TOKEN_KEYWORD)) + { + switch (token->keyword) + { + case KEYWORD_article: + eof = parseTag (token, BIBTAG_ARTICLE); + break; + case KEYWORD_book: + eof = parseTag (token, BIBTAG_BOOK); + break; + case KEYWORD_booklet: + eof = parseTag (token, BIBTAG_BOOKLET); + break; + case KEYWORD_conference: + eof = parseTag (token, BIBTAG_CONFERENCE); + break; + case KEYWORD_inbook: + eof = parseTag (token, BIBTAG_INBOOK); + break; + case KEYWORD_incollection: + eof = parseTag (token, BIBTAG_INCOLLECTION); + break; + case KEYWORD_inproceedings: + eof = parseTag (token, BIBTAG_INPROCEEDINGS); + break; + case KEYWORD_manual: + eof = parseTag (token, BIBTAG_MANUAL); + break; + case KEYWORD_mastersthesis: + eof = parseTag (token, BIBTAG_MASTERSTHESIS); + break; + case KEYWORD_misc: + eof = parseTag (token, BIBTAG_MISC); + break; + case KEYWORD_phdthesis: + eof = parseTag (token, BIBTAG_PHDTHESIS); + break; + case KEYWORD_proceedings: + eof = parseTag (token, BIBTAG_PROCEEDINGS); + break; + case KEYWORD_techreport: + eof = parseTag (token, BIBTAG_TECHREPORT); + break; + case KEYWORD_unpublished: + eof = parseTag (token, BIBTAG_UNPUBLISHED); + break; + default: + break; + } + } + if (eof) + break; + } while (true); +} + +static void initialize (const langType language) +{ + Lang_bib = language; +} + +static void findBibTags (void) +{ + tokenInfo *const token = newToken (); + + parseBibFile (token); + + deleteToken (token); +} + +/* Create parser definition structure */ +extern parserDefinition* BibtexParser (void) +{ + Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT); + static const char *const extensions [] = { "bib", NULL }; + parserDefinition *const def = parserNew ("BibTex"); + def->extensions = extensions; + /* + * New definitions for parsing instead of regex + */ + def->kindTable = BibKinds; + def->kindCount = ARRAY_SIZE (BibKinds); + def->parser = findBibTags; + def->initialize = initialize; + def->keywordTable = BibKeywordTable; + def->keywordCount = ARRAY_SIZE (BibKeywordTable); + return def; +} diff --git a/source.mak b/source.mak index 44be8bd917..8e3d3dedc1 100644 --- a/source.mak +++ b/source.mak @@ -195,6 +195,7 @@ PARSER_SRCS = \ parsers/awk.c \ parsers/basic.c \ parsers/beta.c \ + parsers/bibtex.c \ parsers/c.c \ parsers/clojure.c \ parsers/css.c \ diff --git a/win32/ctags_vs2013.vcxproj b/win32/ctags_vs2013.vcxproj index 252c84a1b4..12a7ca58cf 100644 --- a/win32/ctags_vs2013.vcxproj +++ b/win32/ctags_vs2013.vcxproj @@ -167,6 +167,7 @@ + diff --git a/win32/ctags_vs2013.vcxproj.filters b/win32/ctags_vs2013.vcxproj.filters index 01e230db73..96d98ccf14 100644 --- a/win32/ctags_vs2013.vcxproj.filters +++ b/win32/ctags_vs2013.vcxproj.filters @@ -255,6 +255,9 @@ Source Files\Parsers + + Source Files\Parsers + Source Files\Parsers