From 68f6cb9e5f93806d83d3ebfd971ee27986a0c9ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirco=20Sch=C3=B6nfeld?= <TwlyY29@gmail.com>
Date: Thu, 11 Jul 2019 21:43:02 +0200
Subject: [PATCH] BibTex: new parser (#2137)

Add a BibTeX parser that extracts the identifiers of entries in .bib files.
---
 .../bib-simple.d/expected.tags                |  14 +
 Units/parser-bibtex.r/bib-simple.d/input.bib  |  97 ++++
 docs/news.rst                                 |   1 +
 main/parsers_p.h                              |   1 +
 parsers/bibtex.c                              | 425 ++++++++++++++++++
 source.mak                                    |   1 +
 win32/ctags_vs2013.vcxproj                    |   1 +
 win32/ctags_vs2013.vcxproj.filters            |   3 +
 8 files changed, 543 insertions(+)
 create mode 100644 Units/parser-bibtex.r/bib-simple.d/expected.tags
 create mode 100644 Units/parser-bibtex.r/bib-simple.d/input.bib
 create mode 100644 parsers/bibtex.c

diff --git a/Units/parser-bibtex.r/bib-simple.d/expected.tags b/Units/parser-bibtex.r/bib-simple.d/expected.tags
new file mode 100644
index 0000000000..3717d74cb2
--- /dev/null
+++ b/Units/parser-bibtex.r/bib-simple.d/expected.tags
@@ -0,0 +1,14 @@
+1957-doe_loc_ident	input.bib	/^@article{1957-doe_loc_ident,$/;"	a
+1959-rocket_exploration	input.bib	/^@INPROCEEDINGS{1959-rocket_exploration,$/;"	j
+1960-doe-location_splits	input.bib	/^@incollection{1960-doe-location_splits,$/;"	I
+1960-rocket_deep-exploration	input.bib	/^@conference{1960-rocket_deep-exploration,$/;"	c
+1960_conf_splits	input.bib	/^@proceedings{1960_conf_splits,$/;"	P
+1961-doe-diverse_splits	input.bib	/^@inbook{1961-doe-diverse_splits,$/;"	i
+1961_splits	input.bib	/^@techreport{1961_splits$/;"	t
+doe+rocket	input.bib	/^@Book{doe+rocket,$/;"	b
+doe_mastersth	input.bib	/^@mastersthesis{doe_mastersth,$/;"	M
+doe_mastersth_data	input.bib	/^@misc{doe_mastersth_data,$/;"	n
+doe_phd	input.bib	/^@phdthesis{doe_phd,$/;"	p
+man_loc_splits	input.bib	/^@manual{man_loc_splits,$/;"	m
+tiny_collect	input.bib	/^@booklet{tiny_collect,$/;"	B
+xx_thoughts	input.bib	/^@unpublished{xx_thoughts,$/;"	u
diff --git a/Units/parser-bibtex.r/bib-simple.d/input.bib b/Units/parser-bibtex.r/bib-simple.d/input.bib
new file mode 100644
index 0000000000..f5256d00b0
--- /dev/null
+++ b/Units/parser-bibtex.r/bib-simple.d/input.bib
@@ -0,0 +1,97 @@
+% this is a comment that will be ignored
+
+@article{1957-doe_loc_ident,
+ author    = "John Doe,
+ title     = "Exploration of the 
+              Location-Identity Split",
+ journal   = "Journal of Splits",
+ year      =  1957,
+ volume    =  3
+}
+
+@Book{doe+rocket,
+ author    = "John Doe and Rocket Scientist",
+ title     = "Theory of Splits",
+ publisher = "Dover",
+ year      =  1964,
+ address   = "New York City",
+ edition   = "ninth Dover printing, tenth GPO printing"
+}
+
+@booklet{tiny_collect,
+  title    = "A tiny collection of stuff"
+}
+
+@conference{1960-rocket_deep-exploration,
+ author    = "Rocket Scientist",
+ title     = "Deep Exploration of the Singleton Split",
+ booktitle = "34th international conference of important stuff (ICIS)",
+ year      =  1960
+}
+
+@inbook{1961-doe-diverse_splits,
+  author    = "John Doe",
+  title     = "A comprehensive list of splits",
+  pages     = {13-39},
+  publisher = "Penguin Books",
+  year      = 1961
+}
+
+@incollection{1960-doe-location_splits,
+  author    = "John Doe",
+  title     = "Survey of location splits",
+  booktitle = "Current state of the art in computational methods",
+  publisher = "Penguin Books",
+  year      = 1960
+}
+
+@INPROCEEDINGS{1959-rocket_exploration,
+ author    = "Rocket Scientist",
+ title     = "Exploration of the 
+              Location-Singleton Split",
+ booktitle = "33th international conference of important stuff (ICIS)",
+ year      =  1959
+}
+
+@manual{man_loc_splits,
+  title  = "Introduction to Location Splits"
+}
+
+@mastersthesis{doe_mastersth,
+  author = "John Doe"
+  title  = "Evaluating location splits under identity constraints",
+  school = "School of Computer Science",
+  year   = 1955
+}
+
+@misc{doe_mastersth_data,
+  author       = "John Doe"
+  title        = "Dataset of location splits under identity constraints",
+  howpublished = "http://johndoe.edu/masterthesis/data",
+  year         = 1955
+}
+
+@phdthesis{doe_phd,
+  author = "John Doe"
+  title  = "Evaluating location splits under diverse constraints",
+  school = "School of Computer Science",
+  year   = 1958
+}
+
+@proceedings{1960_conf_splits,
+  title = "First international conference of splits",
+  year  = 1960
+}
+
+@techreport{1961_splits
+, author      = "Rocket Scientist"
+, title       = "An introduction to advanced splits"
+, institution = "School of Engineering"
+, year        = 1961
+}
+
+@unpublished{xx_thoughts,
+  author = "John Doe and Rocket Scientist",
+  title  = "Thoughts on the future of splits",
+  note   = "Heavily thought about"
+}
diff --git a/docs/news.rst b/docs/news.rst
index 506968e41b..fe199f9747 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -42,6 +42,7 @@ The following parsers have been added:
 * Autoconf
 * Automake
 * AutoIt
+* BibTex
 * Clojure
 * CMake *optlib*
 * CSS
diff --git a/main/parsers_p.h b/main/parsers_p.h
index 15aca2c1b2..fbb8120847 100644
--- a/main/parsers_p.h
+++ b/main/parsers_p.h
@@ -54,6 +54,7 @@
 	AwkParser, \
 	BasicParser, \
 	BetaParser, \
+  	BibtexParser, \
 	ClojureParser, \
 	CMakeParser, \
 	CParser, \
diff --git a/parsers/bibtex.c b/parsers/bibtex.c
new file mode 100644
index 0000000000..6656992b4a
--- /dev/null
+++ b/parsers/bibtex.c
@@ -0,0 +1,425 @@
+/*
+ *	 Copyright (c) 2008, David Fishburn
+ *	 Copyright (c) 2012, Jan Larres
+ *	 Copyright (c) 2019, Mirco Schönfeld
+ *
+ *	 This source code is released for free distribution under the terms of the
+ *	 GNU General Public License version 2 or (at your option) any later version.
+ *
+ *	 This module contains functions for generating tags for the entry identifiers in BibTeX files.
+ *
+ *	 BibTex language "reference":
+ *		 https://en.wikipedia.org/wiki/BibTeX
+ */
+
+/*
+ *	 INCLUDE FILES
+ */
+#include "general.h"	/* must always come first */
+#include <ctype.h>	/* to define isalpha () */
+#include <string.h>
+
+#include "debug.h"
+#include "entry.h"
+#include "keyword.h"
+#include "parse.h"
+#include "read.h"
+#include "routines.h"
+#include "vstring.h"
+
+/*
+ *	 MACROS
+ */
+#define isType(token,t)		(bool) ((token)->type == (t))
+#define isKeyword(token,k)	(bool) ((token)->keyword == (k))
+#define isIdentChar(c) \
+	(isalpha (c) || isdigit (c) || (c) == '_' || (c) == '-' || (c) == '+' || (c) == ':' || (c) == '.' || (c) == '/')	/* chars accepted in entry identifiers, e.g. knuth:1984 or DBLP:conf/x/Y99 */
+
+/*
+ *	 DATA DECLARATIONS
+ */
+
+/*
+ * Used to specify type of keyword.
+ */
+enum eKeywordId {
+	KEYWORD_article,
+	KEYWORD_book,
+	KEYWORD_booklet,
+	KEYWORD_conference,
+	KEYWORD_inbook,
+	KEYWORD_incollection,
+	KEYWORD_inproceedings,
+	KEYWORD_manual,
+	KEYWORD_mastersthesis,
+	KEYWORD_misc,
+	KEYWORD_phdthesis,
+	KEYWORD_proceedings,
+	KEYWORD_techreport,
+	KEYWORD_unpublished
+};
+typedef int keywordId; /* to allow KEYWORD_NONE */
+
+enum eTokenType {
+	/* 0..255 are the byte's value.  Some are named for convenience */
+	TOKEN_OPEN_CURLY = '{',
+	/* values from 256 up are special (non-byte) token types */
+	TOKEN_UNDEFINED = 256,
+	TOKEN_KEYWORD,
+	TOKEN_IDENTIFIER
+};
+typedef int tokenType;
+
+typedef struct sTokenInfo {
+	tokenType		type;	/* byte value of a single-char token, or a TOKEN_* constant */
+	keywordId		keyword;	/* KEYWORD_* id, or KEYWORD_NONE when not a keyword */
+	vString *		string;	/* text collected for the token */
+	unsigned long 	lineNumber;	/* input line number recorded when the token was read */
+	MIOPos 			filePosition;	/* input file position recorded when the token was read */
+} tokenInfo;
+
+/*
+ *	DATA DEFINITIONS
+ */
+
+static langType Lang_bib;
+
+typedef enum {
+	BIBTAG_ARTICLE,
+	BIBTAG_BOOK,
+	BIBTAG_BOOKLET,
+	BIBTAG_CONFERENCE,
+	BIBTAG_INBOOK,
+	BIBTAG_INCOLLECTION,
+	BIBTAG_INPROCEEDINGS,
+	BIBTAG_MANUAL,
+	BIBTAG_MASTERSTHESIS,
+	BIBTAG_MISC,
+	BIBTAG_PHDTHESIS,
+	BIBTAG_PROCEEDINGS,
+	BIBTAG_TECHREPORT,
+	BIBTAG_UNPUBLISHED,
+	BIBTAG_COUNT
+} bibKind;
+
+static kindDefinition BibKinds [] = {
+	{ true,  'a', "article",				"article"				},
+	{ true,  'b', "book",						"book"					},
+	{ true,  'B', "booklet",				"booklet"				},
+	{ true,  'c', "conference",			"conference"		},
+	{ true,  'i', "inbook",					"inbook"				},
+	{ true,  'I', "incollection",		"incollection"	},
+	{ true,  'j', "inproceedings",	"inproceedings"	},
+	{ true,  'm', "manual",					"manual"				},
+	{ true,  'M', "mastersthesis",	"mastersthesis"	},
+	{ true,  'n', "misc",						"misc"					},
+	{ true,  'p', "phdthesis",			"phdthesis"			},
+	{ true,  'P', "proceedings",		"proceedings"		},
+	{ true,  't', "techreport",			"techreport"		},
+	{ true,  'u', "unpublished",		"unpublished"		}
+};
+
+static const keywordTable BibKeywordTable [] = {
+	/* keyword			  keyword ID */
+	{ "article",	    KEYWORD_article				},
+	{ "book",	        KEYWORD_book				  },
+	{ "booklet",	    KEYWORD_booklet				},
+	{ "conference",	  KEYWORD_conference		},
+	{ "inbook",	      KEYWORD_inbook				},
+	{ "incollection",	KEYWORD_incollection	},
+	{ "inproceedings",KEYWORD_inproceedings	},
+	{ "manual",	      KEYWORD_manual				},
+	{ "mastersthesis",KEYWORD_mastersthesis	},
+	{ "misc",	        KEYWORD_misc				  },
+	{ "phdthesis",	  KEYWORD_phdthesis			},
+	{ "proceedings",	KEYWORD_proceedings		},
+	{ "techreport",	  KEYWORD_techreport		},
+	{ "unpublished",	KEYWORD_unpublished		}
+};
+  
+/*
+ *	 FUNCTION DEFINITIONS
+ */
+
+static tokenInfo *newToken (void)
+{
+	tokenInfo *const token = xMalloc (1, tokenInfo);
+
+	token->type			= TOKEN_UNDEFINED;
+	token->keyword		= KEYWORD_NONE;
+	token->string		= vStringNew ();
+	token->lineNumber   = getInputLineNumber ();
+	token->filePosition = getInputFilePosition ();
+
+	return token;
+}
+
+static void deleteToken (tokenInfo *const token)
+{
+	vStringDelete (token->string);
+	eFree (token);
+}
+
+/*
+ *	 Tag generation functions
+ */
+static void makeBibTag (tokenInfo *const token, bibKind kind)
+{
+	if (BibKinds [kind].enabled)
+	{
+		const char *const name = vStringValue (token->string);
+		tagEntryInfo e;
+		initTagEntry (&e, name, kind);
+
+		e.lineNumber   = token->lineNumber;
+		e.filePosition = token->filePosition;
+
+		makeTagEntry (&e);
+	}
+}
+
+/*
+ *	 Parsing functions
+ */
+
+/*
+ *	Reads an identifier beginning with "firstChar" and appends it to
+ *	"string"; the first non-identifier character, if any, is pushed back.
+ */
+static void parseIdentifier (vString *const string, const int firstChar)
+{
+	int c = firstChar;
+	Assert (isIdentChar (c));
+	do
+	{
+		vStringPut (string, c);
+		c = getcFromInputFile ();
+	} while (c != EOF && isIdentChar (c));
+	if (c != EOF)
+		ungetcToInputFile (c);		/* unget non-identifier character */
+}
+
+static bool readToken (tokenInfo *const token)
+{
+	int c;
+
+	token->type			= TOKEN_UNDEFINED;
+	token->keyword		= KEYWORD_NONE;
+	vStringClear (token->string);
+
+getNextChar:
+
+	do
+	{
+		c = getcFromInputFile ();
+	}
+	while (c == '\t' || c == ' ' || c == '\n');
+
+	token->lineNumber   = getInputLineNumber ();
+	token->filePosition = getInputFilePosition ();
+
+	token->type = (unsigned char) c;
+	switch (c)
+	{
+		case EOF: return false;
+
+		case '@':
+					/*
+					 * All Bib entries start with an at symbol.
+					 * Check if the next character is an alpha character,
+					 * else it cannot be the start of an entry type.
+					 */
+					c = getcFromInputFile ();
+					if (! isalpha (c))
+					  ungetcToInputFile (c);
+					else
+					{
+						vStringPut (token->string, '@');
+						parseIdentifier (token->string, c);
+						token->keyword = lookupCaseKeyword (vStringValue (token->string) + 1, Lang_bib);
+						if (isKeyword (token, KEYWORD_NONE))
+							token->type = TOKEN_IDENTIFIER;
+						else
+							token->type = TOKEN_KEYWORD;
+					}
+					break;
+		case '%':
+					skipToCharacterInInputFile ('\n'); /* % are single line comments */
+					goto getNextChar;
+					break;
+		default:
+					if (isIdentChar (c))
+					{
+						parseIdentifier (token->string, c);
+						token->type = TOKEN_IDENTIFIER;
+					}
+					break;
+	}
+	return true;
+}
+
+static void copyToken (tokenInfo *const dest, tokenInfo *const src)
+{
+	dest->lineNumber = src->lineNumber;
+	dest->filePosition = src->filePosition;
+	dest->type = src->type;
+	dest->keyword = src->keyword;
+	vStringCopy (dest->string, src->string);
+}
+
+/*
+ *	 Scanning functions
+ */
+
+static bool parseTag (tokenInfo *const token, bibKind kind)
+{
+	tokenInfo *	const name = newToken ();
+	vString *		currentid;
+	bool				eof = false;
+
+	currentid = vStringNew ();
+	/*
+	 * Bib entries have this format:
+	 *   @article{identifier,
+	 *   author="John Doe"}
+	 *
+	 * When a keyword is found, the identifier token that follows
+	 * the opening curly brace is used as the tag name.
+	 *
+	 */
+	if (isType (token, TOKEN_KEYWORD))
+	{
+		copyToken (name, token);
+		if (!readToken (token))
+		{
+			eof = true;
+			goto out;
+		}
+	}
+
+	if (isType (token, TOKEN_OPEN_CURLY))
+	{
+		if (!readToken (token))
+		{
+			eof = true;
+			goto out;
+		}
+		if (isType (token, TOKEN_IDENTIFIER)){
+			vStringCat (currentid, token->string);
+			vStringStripTrailing (currentid);
+			if (vStringLength (currentid) > 0)
+			{
+				vStringCopy (name->string, currentid);
+				makeBibTag (name, kind);
+			}
+		}
+		else
+		{ // should find an identifier for bib item at first place
+			eof = true;
+			goto out;
+		}
+	}
+  
+
+ out:
+	deleteToken (name);
+	vStringDelete (currentid);
+	return eof;
+}
+
+static void parseBibFile (tokenInfo *const token)
+{
+	bool eof = false;
+
+	do
+	{
+		if (!readToken (token))
+			break;
+
+		if (isType (token, TOKEN_KEYWORD))
+		{
+			switch (token->keyword)
+			{
+				case KEYWORD_article:
+					eof = parseTag (token, BIBTAG_ARTICLE);
+					break;
+				case KEYWORD_book:
+					eof = parseTag (token, BIBTAG_BOOK);
+					break;
+				case KEYWORD_booklet:
+					eof = parseTag (token, BIBTAG_BOOKLET);
+					break;
+				case KEYWORD_conference:
+					eof = parseTag (token, BIBTAG_CONFERENCE);
+					break;
+				case KEYWORD_inbook:
+					eof = parseTag (token, BIBTAG_INBOOK);
+					break;
+				case KEYWORD_incollection:
+					eof = parseTag (token, BIBTAG_INCOLLECTION);
+					break;
+				case KEYWORD_inproceedings:
+					eof = parseTag (token, BIBTAG_INPROCEEDINGS);
+					break;
+				case KEYWORD_manual:
+					eof = parseTag (token, BIBTAG_MANUAL);
+					break;
+				case KEYWORD_mastersthesis:
+					eof = parseTag (token, BIBTAG_MASTERSTHESIS);
+					break;
+				case KEYWORD_misc:
+					eof = parseTag (token, BIBTAG_MISC);
+					break;
+				case KEYWORD_phdthesis:
+					eof = parseTag (token, BIBTAG_PHDTHESIS);
+					break;
+				case KEYWORD_proceedings:
+					eof = parseTag (token, BIBTAG_PROCEEDINGS);
+					break;
+				case KEYWORD_techreport:
+					eof = parseTag (token, BIBTAG_TECHREPORT);
+					break;
+				case KEYWORD_unpublished:
+					eof = parseTag (token, BIBTAG_UNPUBLISHED);
+					break;
+				default:
+					break;
+			}
+		}
+		if (eof)
+			break;
+	} while (true);
+}
+
+static void initialize (const langType language)
+{
+	Lang_bib = language;
+}
+
+static void findBibTags (void)
+{
+	tokenInfo *const token = newToken ();
+
+	parseBibFile (token);
+
+	deleteToken (token);
+}
+
+/* Create parser definition structure */
+extern parserDefinition* BibtexParser (void)
+{
+	Assert (ARRAY_SIZE (BibKinds) == BIBTAG_COUNT);
+	static const char *const extensions [] = { "bib", NULL };
+	parserDefinition *const def = parserNew ("BibTex");
+	def->extensions = extensions;
+	/*
+	 * New definitions for parsing instead of regex
+	 */
+	def->kindTable		= BibKinds;
+	def->kindCount		= ARRAY_SIZE (BibKinds);
+	def->parser				= findBibTags;
+	def->initialize		= initialize;
+	def->keywordTable	= BibKeywordTable;
+	def->keywordCount	= ARRAY_SIZE (BibKeywordTable);
+	return def;
+}
diff --git a/source.mak b/source.mak
index 44be8bd917..8e3d3dedc1 100644
--- a/source.mak
+++ b/source.mak
@@ -195,6 +195,7 @@ PARSER_SRCS =				\
 	parsers/awk.c			\
 	parsers/basic.c			\
 	parsers/beta.c			\
+	parsers/bibtex.c			\
 	parsers/c.c			\
 	parsers/clojure.c		\
 	parsers/css.c			\
diff --git a/win32/ctags_vs2013.vcxproj b/win32/ctags_vs2013.vcxproj
index 252c84a1b4..12a7ca58cf 100644
--- a/win32/ctags_vs2013.vcxproj
+++ b/win32/ctags_vs2013.vcxproj
@@ -167,6 +167,7 @@
     <ClCompile Include="..\parsers\awk.c" />
     <ClCompile Include="..\parsers\basic.c" />
     <ClCompile Include="..\parsers\beta.c" />
+    <ClCompile Include="..\parsers\bibtex.c" />
     <ClCompile Include="..\parsers\c.c" />
     <ClCompile Include="..\parsers\clojure.c" />
     <ClCompile Include="..\parsers\cobol.c" />
diff --git a/win32/ctags_vs2013.vcxproj.filters b/win32/ctags_vs2013.vcxproj.filters
index 01e230db73..96d98ccf14 100644
--- a/win32/ctags_vs2013.vcxproj.filters
+++ b/win32/ctags_vs2013.vcxproj.filters
@@ -255,6 +255,9 @@
     <ClCompile Include="..\parsers\beta.c">
       <Filter>Source Files\Parsers</Filter>
     </ClCompile>
+    <ClCompile Include="..\parsers\bibtex.c">
+      <Filter>Source Files\Parsers</Filter>
+    </ClCompile>
     <ClCompile Include="..\parsers\c.c">
       <Filter>Source Files\Parsers</Filter>
     </ClCompile>