diff --git a/ctags/Makefile.am b/ctags/Makefile.am index ee2e283102..a06490168b 100644 --- a/ctags/Makefile.am +++ b/ctags/Makefile.am @@ -82,6 +82,7 @@ parsers = \ parsers/geany_matlab.c \ parsers/nsis.c \ parsers/objc.c \ + parsers/ocaml.c \ parsers/pascal.c \ parsers/perl.c \ parsers/perl.h \ diff --git a/ctags/parsers/ocaml.c b/ctags/parsers/ocaml.c new file mode 100644 index 0000000000..d4af643d96 --- /dev/null +++ b/ctags/parsers/ocaml.c @@ -0,0 +1,2094 @@ +/* +* Copyright (c) 2009, Vincent Berthoux +* +* This source code is released for free distribution under the terms of the +* GNU General Public License version 2 or (at your option) any later version. +* +* This module contains functions for generating tags for Objective Caml +* language files. +*/ +/* +* INCLUDE FILES +*/ +#include "general.h" /* must always come first */ + +#include + +#include "debug.h" +#include "entry.h" +#include "keyword.h" +#include "options.h" +#include "parse.h" +#include "read.h" +#include "routines.h" +#include "vstring.h" + +#define OCAML_MAX_STACK_SIZE 256 + +typedef enum { + K_CLASS, /* OCaml class, relatively rare */ + K_METHOD, /* class method */ + K_MODULE, /* OCaml module OR functor */ + K_VARIABLE, + K_VAL, + K_TYPE, /* name of an OCaml type */ + K_FUNCTION, + K_CONSTRUCTOR, /* Constructor of a sum type */ + K_RECORDFIELD, + K_EXCEPTION, +} ocamlKind; + +static kindDefinition OcamlKinds[] = { + {true, 'c', "class", "classes"}, + {true, 'm', "method", "Object's method"}, + {true, 'M', "module", "Module or functor"}, + {true, 'v', "var", "Global variable"}, + {true, 'p', "val", "Signature item"}, + {true, 't', "type", "Type name"}, + {true, 'f', "function", "A function"}, + {true, 'C', "Constructor", "A constructor"}, + {true, 'r', "RecordField", "A 'structure' field"}, + {true, 'e', "Exception", "An exception"}, +}; + +typedef enum { + OcaKEYWORD_and, + OcaKEYWORD_begin, + OcaKEYWORD_class, + OcaKEYWORD_do, + OcaKEYWORD_done, + OcaKEYWORD_else, + OcaKEYWORD_end, + OcaKEYWORD_exception, + OcaKEYWORD_for, + OcaKEYWORD_functor, + OcaKEYWORD_fun, + OcaKEYWORD_function, + OcaKEYWORD_if, + OcaKEYWORD_in, + OcaKEYWORD_let, + OcaKEYWORD_value, + OcaKEYWORD_match, + OcaKEYWORD_method, + OcaKEYWORD_module, + OcaKEYWORD_mutable, + OcaKEYWORD_object, + OcaKEYWORD_of, + OcaKEYWORD_rec, + OcaKEYWORD_sig, + OcaKEYWORD_struct, + OcaKEYWORD_then, + OcaKEYWORD_try, + OcaKEYWORD_type, + OcaKEYWORD_val, + OcaKEYWORD_virtual, + OcaKEYWORD_while, + OcaKEYWORD_with, + + OcaIDENTIFIER, + Tok_PARL, /* '(' */ + Tok_PARR, /* ')' */ + Tok_BRL, /* '[' */ + Tok_BRR, /* ']' */ + Tok_CurlL, /* '{' */ + Tok_CurlR, /* '}' */ + Tok_Prime, /* '\'' */ + Tok_Pipe, /* '|' */ + Tok_EQ, /* '=' */ + Tok_Val, /* string/number/poo */ + Tok_Op, /* any operator recognized by the language */ + Tok_semi, /* ';' */ + Tok_comma, /* ',' */ + Tok_To, /* '->' */ + Tok_Of, /* ':' */ + Tok_Sharp, /* '#' */ + Tok_Backslash, /* '\\' */ + + Tok_EOF /* END of file */ +} ocamlKeyword; + +typedef struct sOcaKeywordDesc { + const char *name; + ocamlKeyword id; +} ocaKeywordDesc; + +typedef ocamlKeyword ocaToken; + +static const keywordTable OcamlKeywordTable[] = { + { "and" , OcaKEYWORD_and }, + { "begin" , OcaKEYWORD_begin }, + { "class" , OcaKEYWORD_class }, + { "do" , OcaKEYWORD_do }, + { "done" , OcaKEYWORD_done }, + { "else" , OcaKEYWORD_else }, + { "end" , OcaKEYWORD_end }, + { "exception" , OcaKEYWORD_exception }, + { "for" , OcaKEYWORD_for }, + { "fun" , OcaKEYWORD_fun }, + { "function" , OcaKEYWORD_fun }, + { "functor" , OcaKEYWORD_functor }, + { "if" , OcaKEYWORD_if }, + { "in" , OcaKEYWORD_in }, + { "let" , OcaKEYWORD_let }, + { "match" , OcaKEYWORD_match }, + { "method" , OcaKEYWORD_method }, + { "module" , OcaKEYWORD_module }, + { "mutable" , OcaKEYWORD_mutable }, + { "object" , OcaKEYWORD_object }, + { "of" , OcaKEYWORD_of }, + { "rec" , OcaKEYWORD_rec }, + { "sig" , OcaKEYWORD_sig }, + { "struct" , OcaKEYWORD_struct }, + { "then" , OcaKEYWORD_then }, + { "try" , OcaKEYWORD_try }, + { "type" , OcaKEYWORD_type }, + { "val" , OcaKEYWORD_val }, + { "value" , OcaKEYWORD_value }, /* just to handle revised syntax */ + { "virtual" , OcaKEYWORD_virtual }, + { "while" , OcaKEYWORD_while }, + { "with" , OcaKEYWORD_with }, + + { "or" , Tok_Op }, + { "mod " , Tok_Op }, + { "land " , Tok_Op }, + { "lor " , Tok_Op }, + { "lxor " , Tok_Op }, + { "lsl " , Tok_Op }, + { "lsr " , Tok_Op }, + { "asr" , Tok_Op }, + { "->" , Tok_To }, + { ":" , Tok_Of }, + { "true" , Tok_Val }, + { "false" , Tok_Val } +}; + +static langType Lang_Ocaml; + +static bool exportLocalInfo = false; + +/*////////////////////////////////////////////////////////////////// +//// lexingInit */ +typedef struct _lexingState { + vString *name; /* current parsed identifier/operator */ + const unsigned char *cp; /* position in stream */ +} lexingState; + +/* array of the size of all possible value for a char */ +static bool isOperator[1 << (8 * sizeof (char))] = { false }; + +/* definition of all the operator in OCaml, + * /!\ certain operator get special treatment + * in regards of their role in OCaml grammar : + * '|' ':' '=' '~' and '?' */ +static void initOperatorTable ( void ) +{ + isOperator['!'] = true; + isOperator['$'] = true; + isOperator['%'] = true; + isOperator['&'] = true; + isOperator['*'] = true; + isOperator['+'] = true; + isOperator['-'] = true; + isOperator['.'] = true; + isOperator['/'] = true; + isOperator[':'] = true; + isOperator['<'] = true; + isOperator['='] = true; + isOperator['>'] = true; + isOperator['?'] = true; + isOperator['@'] = true; + isOperator['^'] = true; + isOperator['~'] = true; + isOperator['|'] = true; +} + +/*////////////////////////////////////////////////////////////////////// +//// Lexing */ +static bool isNum (char c) +{ + return c >= '0' && c <= '9'; +} + +static bool isLowerAlpha (char c) +{ + return c >= 'a' && c <= 'z'; +} + +static bool isUpperAlpha (char c) +{ + return c >= 'A' && c <= 'Z'; +} + +static bool isAlpha (char c) +{ + return isLowerAlpha (c) || isUpperAlpha (c); +} + +static bool isIdent (char c) +{ + return isNum (c) || isAlpha (c) || c == '_' || c == '\''; +} + +static bool isSpace (char c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +static void eatWhiteSpace (lexingState * st) +{ + const unsigned char *cp = st->cp; + while (isSpace (*cp)) + cp++; + + st->cp = cp; +} + +static void eatString (lexingState * st) +{ + bool lastIsBackSlash = false; + bool unfinished = true; + const unsigned char *c = st->cp + 1; + + while (unfinished) + { + /* end of line should never happen. + * we tolerate it */ + if (c == NULL || c[0] == '\0') + break; + else if (*c == '"' && !lastIsBackSlash) + unfinished = false; + else + lastIsBackSlash = *c == '\\'; + + c++; + } + + st->cp = c; +} + +static void eatComment (lexingState * st) +{ + bool unfinished = true; + bool lastIsStar = false; + const unsigned char *c = st->cp + 2; + + while (unfinished) + { + /* we've reached the end of the line.. + * so we have to reload a line... */ + if (c == NULL || *c == '\0') + { + st->cp = readLineFromInputFile (); + /* WOOPS... no more input... + * we return, next lexing read + * will be null and ok */ + if (st->cp == NULL) + return; + c = st->cp; + } + /* we've reached the end of the comment */ + else if (*c == ')' && lastIsStar) + { + unfinished = false; + c++; + } + /* here we deal with imbricated comment, which + * are allowed in OCaml */ + else if (c[0] == '(' && c[1] == '*') + { + st->cp = c; + eatComment (st); + + c = st->cp; + if (c == NULL) + return; + + lastIsStar = false; + c++; + } + /* OCaml has a rule which says : + * + * "Comments do not occur inside string or character literals. + * Nested comments are handled correctly." + * + * So if we encounter a string beginning, we must parse it to + * get a good comment nesting (bug ID: 3117537) + */ + else if (*c == '"') + { + st->cp = c; + eatString (st); + c = st->cp; + } + else + { + lastIsStar = '*' == *c; + c++; + } + } + + st->cp = c; +} + +static void readIdentifier (lexingState * st) +{ + const unsigned char *p; + vStringClear (st->name); + + /* first char is a simple letter */ + if (isAlpha (*st->cp) || *st->cp == '_') + vStringPut (st->name, (int) *st->cp); + + /* Go till you get identifier chars */ + for (p = st->cp + 1; isIdent (*p); p++) + vStringPut (st->name, (int) *p); + + st->cp = p; +} + +static ocamlKeyword eatNumber (lexingState * st) +{ + while (isNum (*st->cp)) + st->cp++; + return Tok_Val; +} + +/* Operator can be defined in OCaml as a function + * so we must be ample enough to parse them normally */ +static ocamlKeyword eatOperator (lexingState * st) +{ + int count = 0; + const unsigned char *root = st->cp; + + vStringClear (st->name); + + while (isOperator[st->cp[count]]) + { + vStringPut (st->name, st->cp[count]); + count++; + } + + st->cp += count; + if (count <= 1) + { + switch (root[0]) + { + case '|': + return Tok_Pipe; + case '=': + return Tok_EQ; + case ':': + return Tok_Of; + default: + return Tok_Op; + } + } + else if (count == 2 && root[0] == '-' && root[1] == '>') + return Tok_To; + else if (count == 2 && root[0] == '|' && root[1] == '>') + return Tok_Op; + else + return Tok_Op; +} + +/* The lexer is in charge of reading the file. + * Some of sub-lexer (like eatComment) also read file. + * lexing is finished when the lexer return Tok_EOF */ +static ocamlKeyword lex (lexingState * st) +{ + int retType; + /* handling data input here */ + while (st->cp == NULL || st->cp[0] == '\0') + { + st->cp = readLineFromInputFile (); + if (st->cp == NULL) + return Tok_EOF; + } + + if (isAlpha (*st->cp)) + { + readIdentifier (st); + retType = lookupKeyword (vStringValue (st->name), Lang_Ocaml); + + if (retType == -1) /* If it's not a keyword */ + { + return OcaIDENTIFIER; + } + else + { + return retType; + } + } + else if (isNum (*st->cp)) + return eatNumber (st); + else if (isSpace (*st->cp)) + { + eatWhiteSpace (st); + return lex (st); + } + else if (*st->cp == '_') + { // special + readIdentifier (st); + return Tok_Val; + } + + /* OCaml permit the definition of our own operators + * so here we check all the consecutive chars which + * are operators to discard them. */ + else if (isOperator[*st->cp]) + return eatOperator (st); + else + { + switch (*st->cp) + { + case '(': + if (st->cp[1] == '*') /* ergl, a comment */ + { + eatComment (st); + return lex (st); + } + else + { + st->cp++; + return Tok_PARL; + } + + case ')': + st->cp++; + return Tok_PARR; + case '[': + st->cp++; + return Tok_BRL; + case ']': + st->cp++; + return Tok_BRR; + case '{': + st->cp++; + return Tok_CurlL; + case '}': + st->cp++; + return Tok_CurlR; + case '\'': + st->cp++; + return Tok_Prime; + case ',': + st->cp++; + return Tok_comma; + case '=': + st->cp++; + return Tok_EQ; + case ';': + st->cp++; + return Tok_semi; + case '"': + eatString (st); + return Tok_Val; + case '#': + st->cp++; + return Tok_Sharp; + case '\\': + st->cp++; + return Tok_Backslash; + default: + st->cp++; + break; + } + } + /* default return if nothing is recognized, + * shouldn't happen, but at least, it will + * be handled without destroying the parsing. */ + return Tok_Val; +} + +/*////////////////////////////////////////////////////////////////////// +//// Parsing */ +typedef void (*parseNext) (vString * const ident, ocaToken what, ocaToken whatNext); + +/********** Helpers */ +/* This variable hold the 'parser' which is going to + * handle the next token */ +static parseNext toDoNext; + +/* Special variable used by parser eater to + * determine which action to put after their + * job is finished. */ +static parseNext comeAfter; + +/* If a token put an end to current declaration/ + * statement */ +static ocaToken terminatingToken; + +/* Token to be searched by the different + * parser eater. */ +static ocaToken waitedToken; + +/* name of the last class, used for + * context stacking. */ +static vString *lastClass; + +typedef enum _sContextKind { + ContextStrong, + ContextSoft +} contextKind; + +typedef enum _sContextType { + ContextType, + ContextModule, + ContextClass, + ContextValue, + ContextFunction, + ContextMethod, + ContextBlock, + ContextMatch +} contextType; + +typedef struct _sOcamlContext { + contextKind kind; /* well if the context is strong or not */ + contextType type; + parseNext callback; /* what to do when a context is pop'd */ + vString *contextName; /* name, if any, of the surrounding context */ +} ocamlContext; + +/* context stack, can be used to output scope information + * into the tag file. */ +static ocamlContext stack[OCAML_MAX_STACK_SIZE]; +/* current position in the tag */ +static int stackIndex; + +/* special function, often recalled, so putting it here */ +static void globalScope (vString * const ident, ocaToken what, ocaToken whatNext); + +/* Return : index of the last named context if one + * is found, -1 otherwise */ +static int getLastNamedIndex ( void ) +{ + int i; + + for (i = stackIndex - 1; i >= 0; --i) + { + if (vStringLength (stack[i].contextName) > 0) + { + return i; + } + } + + return -1; +} + +static int contextDescription (contextType t) +{ + switch (t) + { + case ContextFunction: + return K_FUNCTION; + case ContextMethod: + return K_METHOD; + case ContextValue: + return K_VAL; + case ContextModule: + return K_MODULE; + case ContextType: + return K_TYPE; + case ContextClass: + return K_CLASS; + default: + AssertNotReached(); + return KIND_GHOST_INDEX; + } +} + +static char contextTypeSuffix (contextType t) +{ + switch (t) + { + case ContextFunction: + case ContextMethod: + case ContextValue: + case ContextModule: + return '/'; + case ContextType: + return '.'; + case ContextClass: + return '#'; + case ContextBlock: + return ' '; + case ContextMatch: + return '|'; + default: + return '$'; + } +} + +/* Push a new context, handle null string */ +static void pushContext (contextKind kind, contextType type, parseNext after, + vString const *contextName) +{ + int parentIndex; + + if (stackIndex >= OCAML_MAX_STACK_SIZE) + { + verbose ("OCaml Maximum depth reached"); + return; + } + + stack[stackIndex].kind = kind; + stack[stackIndex].type = type; + stack[stackIndex].callback = after; + + parentIndex = getLastNamedIndex (); + if (contextName == NULL) + { + vStringClear (stack[stackIndex++].contextName); + return; + } + + if (parentIndex >= 0) + { + vStringCopy (stack[stackIndex].contextName, + stack[parentIndex].contextName); + vStringPut (stack[stackIndex].contextName, + contextTypeSuffix (stack[parentIndex].type)); + + vStringCat (stack[stackIndex].contextName, contextName); + } + else + vStringCopy (stack[stackIndex].contextName, contextName); + + stackIndex++; +} + +static void pushStrongContext (vString * name, contextType type) +{ + pushContext (ContextStrong, type, &globalScope, name); +} + +static void pushSoftContext (parseNext continuation, + vString * name, contextType type) +{ + pushContext (ContextSoft, type, continuation, name); +} + +static void pushEmptyContext (parseNext continuation) +{ + pushContext (ContextSoft, ContextValue, continuation, NULL); +} + +/* unroll the stack until the last named context. + * then discard it. Used to handle the : + * let f x y = ... + * in ... + * where the context is reseted after the in. Context may have + * been really nested before that. */ +static void popLastNamed ( void ) +{ + int i = getLastNamedIndex (); + + if (i >= 0) + { + stackIndex = i; + toDoNext = stack[i].callback; + vStringClear (stack[i].contextName); + } + else + { + /* ok, no named context found... + * (should not happen). */ + stackIndex = 0; + toDoNext = &globalScope; + } +} + +/* pop a context without regarding it's content + * (beside handling empty stack case) */ +static void popSoftContext ( void ) +{ + if (stackIndex <= 0) + { + toDoNext = &globalScope; + } + else + { + stackIndex--; + toDoNext = stack[stackIndex].callback; + vStringClear (stack[stackIndex].contextName); + } +} + +/* Reset everything until the last global space. + * a strong context can be : + * - module + * - class definition + * - the initial global space + * - a _global_ declaration (let at global scope or in a module). + * Created to exit quickly deeply nested context */ +static contextType popStrongContext ( void ) +{ + int i; + + for (i = stackIndex - 1; i >= 0; --i) + { + if (stack[i].kind == ContextStrong) + { + stackIndex = i; + toDoNext = stack[i].callback; + vStringClear (stack[i].contextName); + return stack[i].type; + } + } + /* ok, no strong context found... */ + stackIndex = 0; + toDoNext = &globalScope; + return -1; +} + +/* Reset everything before the last match. */ +static void jumpToMatchContext ( void ) +{ + int i; + for (i = stackIndex - 1; i >= 0; --i) + { + if (stack[i].type == ContextMatch) + { + stackIndex = i + 1; + toDoNext = stack[i].callback; // this should always be + // matchPattern + stack[i + 1].callback = NULL; + vStringClear (stack[i + 1].contextName); + return; + } + } +} + +/* Ignore everything till waitedToken and jump to comeAfter. + * If the "end" keyword is encountered break, doesn't remember + * why though. */ +static void tillToken (vString * const ident CTAGS_ATTR_UNUSED, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + if (what == waitedToken) + toDoNext = comeAfter; + else if (what == OcaKEYWORD_end) + { + popStrongContext (); + toDoNext = &globalScope; + } +} + +/* Ignore everything till a waitedToken is seen, but + * take care of balanced parentheses/bracket use */ +static void contextualTillToken (vString * const ident, ocaToken what, ocaToken whatNext) +{ + static int parentheses = 0; + static int bracket = 0; + static int curly = 0; + + switch (what) + { + case Tok_PARL: + parentheses--; + break; + case Tok_PARR: + parentheses++; + break; + case Tok_CurlL: + curly--; + break; + case Tok_CurlR: + curly++; + break; + case Tok_BRL: + bracket--; + break; + case Tok_BRR: + bracket++; + break; + + default: /* other token are ignored */ + break; + } + + if (what == waitedToken && parentheses == 0 && bracket == 0 && curly == 0) + toDoNext = comeAfter; + else if (what == OcaKEYWORD_end) + globalScope (ident, what, whatNext); +} + +/* Wait for waitedToken and jump to comeAfter or let + * the globalScope handle declarations */ +static void tillTokenOrFallback (vString * const ident, ocaToken what, ocaToken whatNext) +{ + if (what == waitedToken) + toDoNext = comeAfter; + else + globalScope (ident, what, whatNext); +} + +/* ignore token till waitedToken, or give up if find + * terminatingToken. Use globalScope to handle new + * declarations. */ +static void tillTokenOrTerminatingOrFallback (vString * const ident, ocaToken what, ocaToken whatNext) +{ + if (what == waitedToken) + toDoNext = comeAfter; + else if (what == terminatingToken) + toDoNext = globalScope; + else + globalScope (ident, what, whatNext); +} + +/* ignore the next token in the stream and jump to the + * given comeAfter state */ +static void ignoreToken (vString * const ident CTAGS_ATTR_UNUSED, ocaToken what CTAGS_ATTR_UNUSED, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + toDoNext = comeAfter; +} + +/********** Grammar */ +/* the purpose of each function is detailed near their + * implementation */ + +static contextType killCurrentState ( void ) +{ + contextType popped = popStrongContext (); + + /* Tracking the kind of previous strong + * context, if it doesn't match with a + * really strong entity, repop */ + switch (popped) + { + case ContextValue: + popped = popStrongContext (); + break; + case ContextFunction: + popped = popStrongContext (); + break; + case ContextMethod: + popped = popStrongContext (); + break; + case ContextType: + popped = popStrongContext (); + break; + case ContextMatch: + popped = popStrongContext (); + break; + case ContextBlock: + break; + case ContextModule: + break; + case ContextClass: + break; + default: + /* nothing more */ + break; + } + return popped; +} + +/* Keep track of our _true_ line number and file pos, + * as the lookahead token gives us false values. */ +static unsigned long ocaLineNumber; +static MIOPos ocaFilePosition; + +/* Used to prepare an OCaml tag, just in case there is a need to + * add additional information to the tag. */ +static void prepareTag (tagEntryInfo * tag, vString const *name, int kind) +{ + int parentIndex; + + initTagEntry (tag, vStringValue (name), kind); + /* Ripped out of read.h initTagEntry, because of line number + * shenanigans. + * Ugh. Lookahead is harder than I expected. */ + tag->lineNumber = ocaLineNumber; + tag->filePosition = ocaFilePosition; + + parentIndex = getLastNamedIndex (); + if (parentIndex >= 0) + { + tag->extensionFields.scopeKindIndex = + contextDescription (stack[parentIndex].type); + tag->extensionFields.scopeName = + vStringValue (stack[parentIndex].contextName); + } +} + +/* Used to centralise tag creation, and be able to add + * more information to it in the future */ +static void addTag (vString * const ident, int kind) +{ + if (OcamlKinds [kind].enabled && ident != NULL && vStringLength (ident) > 0) + { + tagEntryInfo toCreate; + prepareTag (&toCreate, ident, kind); + makeTagEntry (&toCreate); + } +} + +static bool needStrongPoping = false; +static void requestStrongPoping ( void ) +{ + needStrongPoping = true; +} + +static void cleanupPreviousParser ( void ) +{ + if (needStrongPoping) + { + needStrongPoping = false; + popStrongContext (); + } +} + +/* Due to some circular dependencies, the following functions + * must be forward-declared. */ +static void letParam (vString * const ident, ocaToken what, ocaToken whatNext); +static void localScope (vString * const ident, ocaToken what, ocaToken whatNext); +static void mayRedeclare (vString * const ident, ocaToken what, ocaToken whatNext); +static void typeSpecification (vString * const ident, ocaToken what, ocaToken whatNext); + +/* + * Parse a record type + * type ident = // parsed previously + * { + * ident1: type1; + * ident2: type2; + * } + */ +static void typeRecord (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case OcaIDENTIFIER: + addTag (ident, K_RECORDFIELD); + terminatingToken = Tok_CurlR; + waitedToken = Tok_semi; + comeAfter = &typeRecord; + toDoNext = &tillTokenOrTerminatingOrFallback; + break; + + case OcaKEYWORD_mutable: + /* ignore it */ + break; + + case Tok_CurlR: + popStrongContext (); + // don't pop the module context when going to another expression + needStrongPoping = false; + toDoNext = &globalScope; + break; + + default: /* don't care */ + break; + } +} + +/* handle : + * exception ExceptionName of ... */ +static void exceptionDecl (vString * const ident, ocaToken what, ocaToken whatNext) +{ + if (what == OcaIDENTIFIER) + { + addTag (ident, K_EXCEPTION); + } + else /* probably ill-formed, give back to global scope */ + { + globalScope (ident, what, whatNext); + } + toDoNext = &globalScope; +} + +static tagEntryInfo tempTag; +static vString *tempIdent; + +/* Ensure a constructor is not a type path beginning + * with a module */ +static void constructorValidation (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case Tok_Op: /* if we got a '.' which is an operator */ + toDoNext = &globalScope; + popStrongContext (); + needStrongPoping = false; + break; + + case OcaKEYWORD_of: /* OK, it must be a constructor :) */ + if (vStringLength (tempIdent) > 0) + { + makeTagEntry (&tempTag); + vStringClear (tempIdent); + } + toDoNext = &tillTokenOrFallback; + comeAfter = &typeSpecification; + waitedToken = Tok_Pipe; + break; + + case Tok_Pipe: /* OK, it was a constructor :) */ + if (vStringLength (tempIdent) > 0) + { + makeTagEntry (&tempTag); + vStringClear (tempIdent); + } + toDoNext = &typeSpecification; + break; + + default: /* and mean that we're not facing a module name */ + if (vStringLength (tempIdent) > 0) + { + makeTagEntry (&tempTag); + vStringClear (tempIdent); + } + toDoNext = &tillTokenOrFallback; + comeAfter = &typeSpecification; + waitedToken = Tok_Pipe; + + popStrongContext (); + + // don't pop the module context when going to another expression + needStrongPoping = false; + + /* to be sure we use this token */ + globalScope (ident, what, whatNext); + } +} + +/* Parse beginning of type definition + * type 'avar ident = + * or + * type ('var1, 'var2) ident = + */ +static void typeDecl (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + /* parameterized */ + case Tok_Prime: + comeAfter = &typeDecl; + toDoNext = &ignoreToken; + break; + /* LOTS of parameters */ + case Tok_PARL: + comeAfter = &typeDecl; + waitedToken = Tok_PARR; + toDoNext = &tillToken; + break; + + case OcaIDENTIFIER: + addTag (ident, K_TYPE); + // true type declaration + if (whatNext == Tok_EQ) + { + pushStrongContext (ident, ContextType); + requestStrongPoping (); + toDoNext = &typeSpecification; + } + else // we're in a sig + toDoNext = &globalScope; + break; + + default: + globalScope (ident, what, whatNext); + } +} + +/** handle 'val' signatures in sigs and .mli files + * val ident : String.t -> Val.t + * Eventually, this will do cool things to annotate + * functions with their actual signatures. But for now, + * it's basically globalLet */ +static void val (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case Tok_PARL: + case OcaKEYWORD_rec: + break; + + case Tok_Op: + /* we are defining a new operator, it's a + * function definition */ + addTag (ident, K_VAL); + toDoNext = &globalScope; + break; + + case Tok_Val: /* Can be a weiiird binding, or an '_' */ + case OcaIDENTIFIER: + addTag (ident, K_VAL); + toDoNext = &globalScope; // sig parser ? + break; + + default: + toDoNext = &globalScope; + break; + } +} + +/* Parse type of kind + * type bidule = Ctor1 of ... + * | Ctor2 + * | Ctor3 of ... + * or + * type bidule = | Ctor1 of ... | Ctor2 + * + * when type bidule = { ... } is detected, + * let typeRecord handle it. */ +static void typeSpecification (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case OcaIDENTIFIER: + if (isUpperAlpha (vStringChar (ident, 0))) + { + /* here we handle type aliases of type + * type foo = AnotherModule.bar + * AnotherModule can mistakenly be took + * for a constructor. */ + if (! OcamlKinds[K_CONSTRUCTOR].enabled) + vStringClear (tempIdent); + else + { + vStringCopy (tempIdent, ident); + prepareTag (&tempTag, tempIdent, K_CONSTRUCTOR); + } + toDoNext = &constructorValidation; + } + else + { + toDoNext = &tillTokenOrFallback; + comeAfter = &typeSpecification; + waitedToken = Tok_Pipe; + } + break; + + case OcaKEYWORD_and: + toDoNext = &typeDecl; + break; + + case OcaKEYWORD_val: + toDoNext = &val; + break; + + case Tok_BRL: /* the '[' & ']' are ignored to accommodate */ + case Tok_BRR: /* with the revised syntax */ + case Tok_Pipe: + /* just ignore it */ + break; + + case Tok_CurlL: + toDoNext = &typeRecord; + break; + + default: /* don't care */ + break; + } +} + + +static bool dirtySpecialParam = false; + +/* parse the ~label and ~label:type parameter */ +static void parseLabel (vString * const ident, ocaToken what, ocaToken whatNext) +{ + static int parCount = 0; + + switch (what) + { + case OcaIDENTIFIER: + if (!dirtySpecialParam) + { + if (exportLocalInfo) + addTag (ident, K_VARIABLE); + + dirtySpecialParam = true; + } + break; + + case Tok_PARL: + parCount++; + break; + + case Tok_PARR: + parCount--; + if (parCount == 0) + toDoNext = &letParam; + break; + + case Tok_Op: + if (vStringChar(ident, 0) == ':') + { + toDoNext = &ignoreToken; + comeAfter = &letParam; + } + else if (parCount == 0 && dirtySpecialParam) + { + toDoNext = &letParam; + letParam (ident, what, whatNext); + } + break; + + default: + if (parCount == 0 && dirtySpecialParam) + { + toDoNext = &letParam; + letParam (ident, what, whatNext); + } + break; + } +} + +/* Optional argument with syntax like this : + * ?(foo = value) */ +static void parseOptionnal (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + static int parCount = 0; + + switch (what) + { + case OcaIDENTIFIER: + if (!dirtySpecialParam) + { + if (exportLocalInfo) + addTag (ident, K_VARIABLE); + + dirtySpecialParam = true; + + if (parCount == 0) + toDoNext = &letParam; + } + break; + + case Tok_PARL: + parCount++; + break; + + case Tok_PARR: + parCount--; + if (parCount == 0) + toDoNext = &letParam; + break; + + default: /* don't care */ + break; + } +} + +/** handle let inside functions (so like it's name + * say : local let */ +static void localLet (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case Tok_PARL: + /* We ignore this token to be able to parse such + * declarations : + * let (ident : type) = ... + */ + break; + + case OcaKEYWORD_rec: + /* just ignore to be able to parse such declarations: + * let rec ident = ... */ + break; + + case Tok_Op: + /* we are defining a new operator, it's a + * function definition */ + if (exportLocalInfo) + addTag (ident, K_FUNCTION); + pushSoftContext (mayRedeclare, ident, ContextFunction); + toDoNext = &letParam; + break; + + case Tok_Val: /* Can be a weiiird binding, or an '_' */ + case OcaIDENTIFIER: + // if we're an identifier, and the next token is too, then + // we're definitely a function. + if (whatNext == OcaIDENTIFIER || whatNext == Tok_PARL) + { + if (exportLocalInfo) + addTag (ident, K_FUNCTION); + pushSoftContext (mayRedeclare, ident, ContextFunction); + } + else + { + if (exportLocalInfo) + addTag (ident, K_VARIABLE); + pushSoftContext (mayRedeclare, ident, ContextValue); + } + toDoNext = &letParam; + break; + + case OcaKEYWORD_end: + localScope (ident, what, whatNext); + break; + + default: + toDoNext = &localScope; + break; + } +} + +/* parse : + * | pattern pattern -> ... + * or + * pattern apttern apttern -> ... + * we ignore all identifiers declared in the pattern, + * because their scope is likely to be even more limited + * than the let definitions. + * Used after a match ... with, or a function ... + * because their syntax is similar. */ +static void matchPattern (vString * const ident, ocaToken what, ocaToken whatNext) +{ + /* keep track of [], as it + * can be used in patterns and can + * mean the end of match expression in + * revised syntax */ + static int braceCount = 0; + + switch (what) + { + case Tok_To: + pushEmptyContext (&matchPattern); + toDoNext = &mayRedeclare; + break; + + case Tok_BRL: + braceCount++; + break; + + case OcaKEYWORD_value: + popLastNamed (); + case OcaKEYWORD_and: + case OcaKEYWORD_end: + // why was this global? matches only make sense in local scope + localScope (ident, what, whatNext); + break; + + case OcaKEYWORD_in: + popLastNamed (); + break; + + default: + break; + } +} + +/* Used at the beginning of a new scope (begin of a + * definition, parenthesis...) to catch inner let + * definition that may be in. */ +static void mayRedeclare (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case OcaKEYWORD_value: + /* let globalScope handle it */ + globalScope (ident, what, whatNext); + + case OcaKEYWORD_let: + toDoNext = &localLet; + break; + + case OcaKEYWORD_val: + toDoNext = &val; + break; + + case OcaKEYWORD_object: + vStringClear (lastClass); + pushContext (ContextStrong, ContextClass, + &localScope, NULL); + needStrongPoping = false; + toDoNext = &globalScope; + break; + + case OcaKEYWORD_for: + case OcaKEYWORD_while: + toDoNext = &tillToken; + waitedToken = OcaKEYWORD_do; + comeAfter = &mayRedeclare; + break; + + case OcaKEYWORD_try: + toDoNext = &mayRedeclare; + pushSoftContext (&matchPattern, ident, ContextFunction); + break; + + case OcaKEYWORD_function: + toDoNext = &matchPattern; + pushSoftContext (&matchPattern, NULL, ContextMatch); + break; + + case OcaKEYWORD_fun: + toDoNext = &letParam; + break; + + /* Handle the special ;; from the OCaml + * Top level */ + case Tok_semi: + default: + toDoNext = &localScope; + localScope (ident, what, whatNext); + } +} + +/* parse : + * p1 p2 ... pn = ... + * or + * ?(p1=v) p2 ~p3 ~pn:ja ... = ... */ +static void letParam (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case Tok_To: + case Tok_EQ: + toDoNext = &mayRedeclare; + break; + + case OcaIDENTIFIER: + if (exportLocalInfo) + addTag (ident, K_VARIABLE); + break; + + case Tok_Op: + switch (vStringChar (ident, 0)) + { + case ':': + /*popSoftContext(); */ + /* we got a type signature */ + comeAfter = &mayRedeclare; + toDoNext = &tillTokenOrFallback; + waitedToken = Tok_EQ; + break; + + /* parse something like + * ~varname:type + * or + * ~varname + * or + * ~(varname: long type) */ + case '~': + toDoNext = &parseLabel; + dirtySpecialParam = false; + break; + + /* Optional argument with syntax like this : + * ?(bla = value) + * or + * ?bla */ + case '?': + toDoNext = &parseOptionnal; + dirtySpecialParam = false; + break; + + default: + break; + } + break; + + default: /* don't care */ + break; + } +} + +/* parse object ... + * used to be sure the class definition is not a type + * alias */ +static void classSpecif (vString * const ident CTAGS_ATTR_UNUSED, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case OcaKEYWORD_object: + pushStrongContext (lastClass, ContextClass); + toDoNext = &globalScope; + break; + + default: + vStringClear (lastClass); + toDoNext = &globalScope; + } +} + +/* Handle a method ... class declaration. + * nearly a copy/paste of globalLet. */ +static void methodDecl (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case Tok_PARL: + /* We ignore this token to be able to parse such + * declarations : + * let (ident : type) = ... */ + break; + + case OcaKEYWORD_mutable: + case OcaKEYWORD_virtual: + case OcaKEYWORD_rec: + /* just ignore to be able to parse such declarations: + * let rec ident = ... */ + break; + + case OcaIDENTIFIER: + addTag (ident, K_METHOD); + /* Normal pushing to get good subs */ + pushStrongContext (ident, ContextMethod); + /*pushSoftContext( globalScope, ident, ContextMethod ); */ + toDoNext = &letParam; + break; + + case OcaKEYWORD_end: + localScope (ident, what, whatNext); + break; + + default: + toDoNext = &globalScope; + break; + } +} + +/* name of the last module, used for + * context stacking. */ +static vString *lastModule; + +/* parse + * ... struct (* new global scope *) end + * or + * ... sig (* new global scope *) end + * or + * functor ... -> moduleSpecif + */ +static void moduleSpecif (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case OcaKEYWORD_functor: + toDoNext = &contextualTillToken; + waitedToken = Tok_To; + comeAfter = &moduleSpecif; + break; + + case OcaKEYWORD_struct: + case OcaKEYWORD_sig: + pushStrongContext (lastModule, ContextModule); + toDoNext = &globalScope; + needStrongPoping = false; + break; + + case Tok_PARL: /* ( */ + toDoNext = &contextualTillToken; + comeAfter = &globalScope; + waitedToken = Tok_PARR; + contextualTillToken (ident, what, whatNext); + break; + + case Tok_Of: + case Tok_EQ: + break; + + default: + vStringClear (lastModule); + toDoNext = &globalScope; + break; + } +} + +/* parse : + * module name = ... + * then pass the token stream to moduleSpecif */ +static void moduleDecl (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case OcaKEYWORD_rec: + /* recursive modules are _weird_, but they happen */ + case OcaKEYWORD_type: + /* this is technically a special type, but whatever */ + break; + + case OcaIDENTIFIER: + addTag (ident, K_MODULE); + vStringCopy (lastModule, ident); + if (whatNext == Tok_Of || whatNext == Tok_EQ) + toDoNext = &moduleSpecif; + else + { + // default to waiting on a '=' since + // module M : sig ... end = struct ... end + // is rarer + waitedToken = Tok_EQ; + comeAfter = &moduleSpecif; + toDoNext = &contextualTillToken; + } + break; + + default: /* don't care */ + break; + } +} + +/* parse : + * class name = ... + * or + * class virtual ['a,'b] classname = ... */ +static void classDecl (vString * const ident, ocaToken what, ocaToken whatNext CTAGS_ATTR_UNUSED) +{ + switch (what) + { + case OcaIDENTIFIER: + addTag (ident, K_CLASS); + vStringCopy (lastClass, ident); + toDoNext = &contextualTillToken; + waitedToken = Tok_EQ; + comeAfter = &classSpecif; + break; + + case Tok_BRL: + toDoNext = &tillToken; + waitedToken = Tok_BRR; + comeAfter = &classDecl; + break; + + default: + break; + } +} + +/* Handle a global + * let ident ... + * or + * let rec ident ... */ +static void globalLet (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + case Tok_PARL: + /* We ignore this token to be able to parse such + * declarations : + * let (ident : type) = ... + * but () is the toplevel function name, so fake ourselves + * as an ident and make a new function */ + if (whatNext == Tok_PARR) + { + vString *fakeIdent = vStringNewInit ("()"); + addTag (fakeIdent, K_FUNCTION); + pushStrongContext (fakeIdent, ContextFunction); + vStringDelete (fakeIdent); + requestStrongPoping (); + toDoNext = &letParam; + } + break; + + case OcaKEYWORD_mutable: + case OcaKEYWORD_virtual: + case OcaKEYWORD_rec: + /* just ignore to be able to parse such declarations: + * let rec ident = ... */ + break; + + case Tok_Op: + /* we are defining a new operator, it's a + * function definition */ + addTag (ident, K_FUNCTION); + pushStrongContext (ident, ContextFunction); + toDoNext = &letParam; + break; + + case Tok_Val: + if (vStringValue (ident)[0] == '_') + addTag (ident, K_FUNCTION); + pushStrongContext (ident, ContextFunction); + requestStrongPoping (); + toDoNext = &letParam; + break; + + case OcaIDENTIFIER: + // if we're an identifier, and the next token is too, then + // we're definitely a function. + if (whatNext == OcaIDENTIFIER || whatNext == Tok_PARL) + { + addTag (ident, K_FUNCTION); + pushStrongContext (ident, ContextFunction); + } + else + { + addTag (ident, K_VARIABLE); + pushStrongContext (ident, ContextValue); + } + requestStrongPoping (); + toDoNext = &letParam; + break; + + case OcaKEYWORD_end: + globalScope (ident, what, whatNext); + break; + + default: + toDoNext = &globalScope; + break; + } +} + +/* Handle the "strong" top levels, all 'big' declarations + * happen here */ +static void globalScope (vString * const ident CTAGS_ATTR_UNUSED, ocaToken what, ocaToken whatNext) +{ + /* Do not touch, this is used only by the global scope + * to handle an 'and' */ + static parseNext previousParser = &globalScope; + + switch (what) + { + case OcaKEYWORD_and: + cleanupPreviousParser (); + // deal with module M = struct ... end _and_ N = struct ... end + toDoNext = previousParser; + break; + + case OcaKEYWORD_type: + cleanupPreviousParser (); + toDoNext = &typeDecl; + previousParser = &typeDecl; + break; + + case OcaKEYWORD_class: + cleanupPreviousParser (); + toDoNext = &classDecl; + previousParser = &classDecl; + break; + + case OcaKEYWORD_module: + cleanupPreviousParser (); + toDoNext = &moduleDecl; + previousParser = &moduleDecl; + break; + + case OcaKEYWORD_end:; + contextType popped = killCurrentState (); + + /** so here, end can legally be followed by = or and in the + * situation of + * module M : sig ... end = struct ... end and + * module M struct ... end and N = struct ... end + * and we need to make sure we know we're still inside of a + * struct */ + if (whatNext == Tok_EQ && popped == ContextModule) + { + previousParser = &moduleDecl; + toDoNext = &moduleSpecif; + } + else if (whatNext == OcaKEYWORD_and && popped == ContextModule) + toDoNext = &moduleDecl; + needStrongPoping = false; + break; + + case OcaKEYWORD_method: + cleanupPreviousParser (); + toDoNext = &methodDecl; + /* and is not allowed in methods */ + break; + + case OcaKEYWORD_val: + toDoNext = &val; + /* and is not allowed in sigs */ + break; + + case OcaKEYWORD_let: + cleanupPreviousParser (); + toDoNext = &globalLet; + previousParser = &globalLet; + break; + + case OcaKEYWORD_exception: + cleanupPreviousParser (); + toDoNext = &exceptionDecl; + previousParser = &globalScope; + break; + + /* must be a #line directive, discard the + * whole line. */ + case Tok_Sharp: + /* ignore */ + break; + + default: + /* we don't care */ + break; + } +} + +/* Parse expression. Well ignore it is more the case, + * ignore all tokens except "shocking" keywords */ +static void localScope (vString * const ident, ocaToken what, ocaToken whatNext) +{ + switch (what) + { + + // we're probably in a match, so let's go to the last one + case Tok_Pipe: + jumpToMatchContext (); + break; + + case Tok_PARR: + case Tok_BRR: + case Tok_CurlR: + popSoftContext (); + break; + + /* Everything that `begin` has an `end` + * as end is overloaded and signal many end + * of things, we add an empty strong context to + * avoid problem with the end. + */ + case OcaKEYWORD_begin: + pushContext (ContextStrong, ContextBlock, &mayRedeclare, NULL); + toDoNext = &mayRedeclare; + break; + + /* An in keyword signals the end of the previous context and the + * start of a new one. */ + case OcaKEYWORD_in: + popLastNamed (); + pushEmptyContext (&localScope); + toDoNext = &mayRedeclare; + break; + + /* Ok, we got a '{', which is much likely to create + * a record. We cannot treat it like other [ && (, + * because it may contain the 'with' keyword and screw + * everything else. */ + case Tok_CurlL: + toDoNext = &contextualTillToken; + waitedToken = Tok_CurlR; + comeAfter = &localScope; + contextualTillToken (ident, what, whatNext); + break; + + /* Yeah imperative feature of OCaml, + * a ';' like in C */ + case Tok_semi: + /* ';;' case should end all scopes */ + if (whatNext == Tok_semi) + { + popStrongContext (); + toDoNext = &globalScope; + break; + } /* else fallthrough */ + + /* Every standard operator has very high precedence + * e.g. expr * expr needs no parentheses */ + case Tok_Op: + toDoNext = &mayRedeclare; + break; + + case Tok_PARL: + case Tok_BRL: + pushEmptyContext (&localScope); + toDoNext = &mayRedeclare; + break; + + case OcaKEYWORD_and: + if (toDoNext == &mayRedeclare) + { + popSoftContext (); + pushEmptyContext (localScope); + toDoNext = &localLet; + } + else + { + /* a local 'and' keyword jumps up a context to the last + * named. For ex + * in `with let IDENT ... and IDENT2 ...` ident and + * ident2 are on + * same level, the same as `let IDENT ... in let IDENT2 + * ...` + * a 'let' is the only 'and'-chainable construct allowed + * locally + * (thus we had to be one to get here), so we either go + * to + * globalLet or localLet depending on our scope. */ + popLastNamed (); + toDoNext = stackIndex == 0 ? &globalLet : &localLet; + } + break; + + case OcaKEYWORD_else: + case OcaKEYWORD_then: + popSoftContext (); + pushEmptyContext (&localScope); + toDoNext = &mayRedeclare; + break; + + case OcaKEYWORD_if: + pushEmptyContext (&localScope); + toDoNext = &mayRedeclare; + break; + + case OcaKEYWORD_match: + pushEmptyContext (&localScope); + toDoNext = &mayRedeclare; + break; + + case OcaKEYWORD_with: + popSoftContext (); + toDoNext = &matchPattern; + pushSoftContext (&matchPattern, NULL, ContextMatch); + break; + + case OcaKEYWORD_fun: + toDoNext = &letParam; + break; + + case OcaKEYWORD_done: + /* doesn't care */ + break; + + default: + requestStrongPoping (); + globalScope (ident, what, whatNext); + break; + } +} + +/*//////////////////////////////////////////////////////////////// +//// Deal with the system */ +/* in OCaml the file name is the module name used in the language + * with it first letter put in upper case */ +static void computeModuleName ( void ) +{ + /* in OCaml the file name define a module. + * so we define a module if the file has + * things in it. =) + */ + const char *filename = getInputFileName (); + + int beginIndex = 0; + int endIndex = strlen (filename) - 1; + vString *moduleName = vStringNew (); + + while (filename[endIndex] != '.' && endIndex > 0) + endIndex--; + + /* avoid problem with path in front of filename */ + beginIndex = endIndex; + while (beginIndex > 0) + { + if (filename[beginIndex] == '\\' || filename[beginIndex] == '/') + { + beginIndex++; + break; + } + + beginIndex--; + } + + vStringNCopyS (moduleName, &filename[beginIndex], endIndex - beginIndex); + + if (isLowerAlpha (vStringChar (moduleName, 0))) + vStringChar (moduleName, 0) += ('A' - 'a'); + + addTag (moduleName, K_MODULE); + vStringDelete (moduleName); +} + +/* Allocate all string of the context stack */ +static void initStack ( void ) +{ + int i; + for (i = 0; i < OCAML_MAX_STACK_SIZE; ++i) + stack[i].contextName = vStringNew (); + stackIndex = 0; +} + +static void clearStack ( void ) +{ + int i; + for (i = 0; i < OCAML_MAX_STACK_SIZE; ++i) + vStringDelete (stack[i].contextName); +} + +static void findOcamlTags (void) +{ + lexingState st; + ocaToken tok; + + /* One-token lookahead gives us the ability to + * do much more accurate analysis */ + lexingState nextSt; + ocaToken nextTok; + + initStack (); + + tempIdent = vStringNew (); + lastModule = vStringNew (); + lastClass = vStringNew (); + vString *temp_cp = vStringNew (); + + nextSt.name = vStringNew (); + nextSt.cp = readLineFromInputFile (); + ocaLineNumber = getInputLineNumber(); + ocaFilePosition = getInputFilePosition(); + toDoNext = &globalScope; + nextTok = lex (&nextSt); + + if (nextTok != Tok_EOF) + computeModuleName (); + + /* prime the lookahead token */ + st = nextSt; // preserve the old state for our first token + st.name = vStringNewCopy (st.name); + st.cp = (const unsigned char *) vStringValue (temp_cp); + tok = nextTok; + ocaLineNumber = getInputLineNumber(); /* ??? getSourceLineNumber() */ + ocaFilePosition = getInputFilePosition(); + nextTok = lex (&nextSt); + + /* main loop */ + while (tok != Tok_EOF) + { + (*toDoNext) (st.name, tok, nextTok); + + tok = nextTok; + ocaLineNumber = getInputLineNumber(); /* ??? */ + ocaFilePosition = getInputFilePosition(); + + if (nextTok != Tok_EOF) + { + vStringCopyS (temp_cp, (const char *) nextSt.cp); + st.cp = (const unsigned char *) vStringValue (temp_cp); + vStringCopy (st.name, nextSt.name); + nextTok = lex (&nextSt); + } + else + break; + } + + vStringDelete (st.name); + vStringDelete (nextSt.name); + vStringDelete (temp_cp); + vStringDelete (tempIdent); + vStringDelete (lastModule); + vStringDelete (lastClass); + clearStack (); +} + +static void ocamlInitialize (const langType language) +{ + Lang_Ocaml = language; + + initOperatorTable (); +} + +extern parserDefinition *OcamlParser (void) +{ + static const char *const extensions[] = { "ml", "mli", "aug", NULL }; + static const char *const aliases[] = { "tuareg", /* mode name of emacs */ + "caml", /* mode name of emacs */ + NULL }; + parserDefinition *def = parserNew ("OCaml"); + def->kindTable = OcamlKinds; + def->kindCount = ARRAY_SIZE (OcamlKinds); + def->extensions = extensions; + def->aliases = aliases; + def->parser = findOcamlTags; + def->initialize = ocamlInitialize; + def->keywordTable = OcamlKeywordTable; + def->keywordCount = ARRAY_SIZE (OcamlKeywordTable); + return def; +} diff --git a/data/filedefs/filetypes.fortran b/data/filedefs/filetypes.fortran index 1521247bef..143bdc686a 100644 --- a/data/filedefs/filetypes.fortran +++ b/data/filedefs/filetypes.fortran @@ -3,8 +3,8 @@ [keywords] # all items must be in one line -primary=abstract access action advance all allstop allocatable allocate apostrophe assign assignment associate asynchronous backspace bind blank block blockdata call case character class close codimension common complex concurrent contains contiguous continue critical cycle data deallocate decimal delim default dimension direct do dowhile double doubleprecision elemental else elseif elsewhere encoding end endassociate endblock endblockdata endcritical enddo endfile endforall endfunction endif endinterface endmodule endprocedure endprogram endselect endsubmodule endsubroutine endtype endwhere entry enum enumerator eor equivalence err errmsg exist exit extends external file final flush fmt forall form format formatted function generic go goto id if images implicit import impure in include inout integer inquire intent interface intrinsic iomsg iolength iostat is kind len lock logical memory module name named namelist nextrec nml non_intrinsic non_overridable none nopass nullify number only open opened operator optional out pad parameter pass pause pending pointer pos position precision print private procedure program protected public quote pure read readwrite real rec recl recursive result return rewind save select selectcase selecttype sequential sign size stat status stop stream submodule subroutine sync syncall syncimages syncmemory target then to type unformatted unit unlock use value volatile wait where while write -intrinsic_functions=abs achar acos acosd acosh adjustl adjustr aimag aimax0 aimin0 aint ajmax0 ajmin0 akmax0 akmin0 all allocated alog alog10 amax0 amax1 amin0 amin1 amod anint any asin asind asinh associated atan atan2 atan2d atand atanh atomic_define atomic_ref bessel_j0 bessel_j1 bessel_jn bessel_y0 bessel_y1 bessel_yn bge bgt bit_size bitest bitl bitlr bitrl bjtest bktest ble blt break btest c_associated c_f_pointer c_f_procpointer c_funloc c_loc c_sizeof cabs ccos cdabs cdcos cdexp cdlog cdsin cdsqrt ceiling cexp char clog cmplx command_argument_count conjg cos cosd cosh count cpu_time cshift csin csqrt dabs dacos dacosd dasin dasind datan datan2 datan2d datand date date_and_time dble dcmplx dconjg dcos dcosd dcosh dcotan ddim dexp dfloat dfloti dflotj dflotk digits dim dimag dint dlog dlog10 dmax1 dmin1 dmod dnint dot_product dprod dreal dshiftl dshiftr dsign dsin dsind dsinh dsqrt dtan dtand dtanh eoshift epsilon erf erfc erfc_scaled errsns execute_command_line exp exponent extends_type_of findloc float floati floatj floatk floor fraction free gamma get_command get_command_argument get_environment_variable huge hypot iabs iachar iall iand iany ibclr ibits ibset ichar idate idim idint idnint ieor ifix iiabs iiand iibclr iibits iibset iidim iidint iidnnt iieor iifix iint iior iiqint iiqnnt iishft iishftc iisign ilen image_index imax0 imax1 imin0 imin1 imod index inint inot int int1 int2 int4 int8 ior iparity iqint iqnint is_contiguous is_isostat_end is_isostat_eor ishft ishftc isign isnan izext jiand jibclr jibits jibset jidim jidint jidnnt jieor jifix jint jior jiqint jiqnnt jishft jishftc jisign jmax0 jmax1 jmin0 jmin1 jmod jnint jnot jzext kiabs kiand kibclr kibits kibset kidim kidint kidnnt kieor kifix kind kint kior kishft kishftc kisign kmax0 kmax1 kmin0 kmin1 kmod knint knot kzext lbound lcobound leadz len len_trim lge lgt lle llt log log10 log_gamma logical lshift malloc maskl maskr matmul max max0 max1 maxexponent maxloc maxval merge merge_bits min min0 min1 minexponent minloc minval mod modulo move_alloc mvbits nearest new_line nint norm2 not null num_images number_of_processors nworkers pack parity popcnt poppar precision present product radix random random_number random_seed range real repeat reshape rrspacing rshift same_type_as scale scan secnds selected_char_kind selected_int_kind selected_real_kind set_exponent shape shifta shiftl shiftr sign sin sind sinh size sizeof sngl snglq spacing spread sqrt storage_size sum system_clock tan tand tanh this_image tiny trailz transfer transpose trim ubound ucobound unpack verify +primary=abstract access action advance all allstop allocatable allocate apostrophe assign assignment associate asynchronous backspace bind blank block blockdata call case character class close codimension common complex concurrent contains contiguous continue critical cycle data deallocate decimal delim default dimension direct do dowhile double doubleprecision elemental else elseif elsewhere encoding end endassociate endblock endblockdata endcritical enddo endfile endforall endfunction endif endinterface endmodule endprocedure endprogram endselect endsubmodule endsubroutine endtype endwhere entry enum enumerator eor equivalence err errmsg exist exit extends external file final flush fmt forall form format formatted function generic go goto id if images implicit import impure in include inout integer inquire intent interface intrinsic iomsg iolength iostat is kind len lock logical memory module name named namelist nextrec nml non_intrinsic non_overridable none nopass nullify number only open opened operator optional out pad parameter pass pause pending pointer pos position precision print private procedure program protected public quote pure read readwrite real rec recl recursive result return rewind save select selectcase selecttype sequential sign size stat status stop stream submodule subroutine sync syncall syncimages syncmemory target then to type unformatted unit unlock use value volatile wait where while write change rank team local_init error newunit local team_number round mold source until_count sequence image quiet non_recursive fail event deferred acquired_lock post shared new_index +intrinsic_functions=abs achar acos acosd acosh adjustl adjustr aimag aimax0 aimin0 aint ajmax0 ajmin0 akmax0 akmin0 all allocated alog alog10 amax0 amax1 amin0 amin1 amod anint any asin asind asinh associated atan atan2 atan2d atand atanh atomic_define atomic_ref bessel_j0 bessel_j1 bessel_jn bessel_y0 bessel_y1 bessel_yn bge bgt bit_size bitest bitl bitlr bitrl bjtest bktest ble blt break btest c_associated c_f_pointer c_f_procpointer c_funloc c_loc c_sizeof cabs ccos cdabs cdcos cdexp cdlog cdsin cdsqrt ceiling cexp char clog cmplx command_argument_count conjg cos cosd cosh count cpu_time cshift csin csqrt dabs dacos dacosd dasin dasind datan datan2 datan2d datand date date_and_time dble dcmplx dconjg dcos dcosd dcosh dcotan ddim dexp dfloat dfloti dflotj dflotk digits dim dimag dint dlog dlog10 dmax1 dmin1 dmod dnint dot_product dprod dreal dshiftl dshiftr dsign dsin dsind dsinh dsqrt dtan dtand dtanh eoshift epsilon erf erfc erfc_scaled errsns execute_command_line exp exponent extends_type_of findloc float floati floatj floatk floor fraction free gamma get_command get_command_argument get_environment_variable huge hypot iabs iachar iall iand iany ibclr ibits ibset ichar idate idim idint idnint ieor ifix iiabs iiand iibclr iibits iibset iidim iidint iidnnt iieor iifix iint iior iiqint iiqnnt iishft iishftc iisign ilen image_index imax0 imax1 imin0 imin1 imod index inint inot int int1 int2 int4 int8 ior iparity iqint iqnint is_contiguous is_isostat_end is_isostat_eor ishft ishftc isign isnan izext jiand jibclr jibits jibset jidim jidint jidnnt jieor jifix jint jior jiqint jiqnnt jishft jishftc jisign jmax0 jmax1 jmin0 jmin1 jmod jnint jnot jzext kiabs kiand kibclr kibits kibset kidim kidint kidnnt kieor kifix kind kint kior kishft kishftc kisign kmax0 kmax1 kmin0 kmin1 kmod knint knot kzext lbound lcobound leadz len len_trim lge lgt lle llt log log10 log_gamma logical lshift malloc maskl maskr matmul max max0 max1 maxexponent maxloc maxval merge merge_bits min min0 min1 minexponent minloc minval mod modulo move_alloc mvbits nearest new_line nint norm2 not null num_images number_of_processors nworkers pack parity popcnt poppar precision present product radix random random_number random_seed range real repeat reshape rrspacing rshift same_type_as scale scan secnds selected_char_kind selected_int_kind selected_real_kind set_exponent shape shifta shiftl shiftr sign sin sind sinh size sizeof sngl snglq spacing spread sqrt storage_size sum system_clock tan tand tanh this_image tiny trailz transfer transpose trim ubound ucobound unpack verify cfi_deallocate rank ieee_round_type atomic_logical_kind error_unit character_kinds int64 ieee_negative_zero atomic_xor c_int real64 c_new_line ieee_get_modes ieee_get_status atomic_add atomic_fetch_add ieee_next_up c_long atomic_fetch_xor c_null_char c_double ieee_rint iso_fortran_env ieee_divide c_intptr_t int32 ieee_support_inf ieee_nan ieee_get_underflow_mode ieee_underflow ieee_support_underflow_control ieee_is_nan ieee_quiet_ne ieee_positive_zero ieee_positive_subnormal compiler_version numeric_storage_size ieee_inexact atomic_cas ieee_selected_real_kind ieee_support_denormal ieee_signbit atomic_or c_int_least64_t atomic_fetch_and c_int_fast8_t c_long_double_complex ieee_support_divide ieee_usual ieee_support_sqrt co_broadcast ieee_max_num_mag ieee_int ieee_signaling_nan logical_kinds ieee_get_flag c_int_least16_t int16 c_null_funptr cfi_establish current_team ieee_divide_by_zero co_reduce ieee_class c_vertical_tab c_intmax_t ieee_negative_inf co_sum stopped_images input_unit ieee_quiet_gt compiler_options ieee_support_io ieee_signaling_le ieee_signaling_gt ieee_next_after ieee_support_rounding ieee_min_num_mag output_unit ieee_support_nan ieee_real ieee_negative_normal ieee_is_normal iostat_inquire_internal_unit atomic_int_kind team_type iostat_end cfi_is_contiguous ieee_support_datatype ieee_away ieee_negative_subnormal ieee_value is_iostat_end ieee_logb ieee_is_negative is_iostat_eor c_ptr ieee_scalb failed_images character_storage_size c_int_fast32_t stat_locked_other_image atomic_fetch_or ieee_get_halting_mode real128 c_int8_t cfi_select_part c_long_long ieee_quiet_lt ieee_negative_denormal c_horizontal_tab real_kinds ieee_rem atomic_and ieee_positive_inf ieee_arithmetic team_number ieee_flag_type c_float ieee_next_down ieee_to_zero ieee_invalid_flag ieee_datatype reduce c_int_fast64_t cfi_address event_query c_int64_t ieee_sqrt c_char c_bool ieee_quiet_le ieee_min_num c_int_fast16_t ieee_max_num integer_kinds ieee_set_underflow_mode c_int32_t ieee_class_type real32 ieee_fma cfi_allocate get_team c_funptr ieee_nearest c_alert co_max parent_team ieee_status_type lock_type c_signed_char ieee_overflow ieee_invalid ieee_signaling_ge c_size_t c_float_complex ieee_set_modes cfi_cdesc_t ieee_get_rounding_mode image_status ieee_support_subnormal ieee_support_flag c_double_complex ieee_quiet_nan ieee_support_halting file_storage_size co_min c_long_double ieee_all c_short iostat_eor c_int_least8_t event_type ieee_unordered ieee_features ieee_quiet_eq stat_unlocked ieee_positive_normal stat_locked ieee_signaling_ne ieee_copy_sign ieee_halting ieee_signaling_eq cfi_setpointer ieee_inexact_flag ieee_is_finite ieee_set_status ieee_support_standard c_null_ptr ieee_inf stat_stopped_image c_carriage_return ieee_down ieee_subnormal ieee_quiet_ge cfi_section ieee_other ieee_positive_denormal ieee_set_flag ieee_rounding ieee_signaling_lt ieee_exceptions c_backspace ieee_set_rounding_mode ieee_up coshape ieee_denormal c_int_least32_t c_int16_t ieee_set_halting_mode ieee_features_type ieee_other_value initial_team stat_unlocked_failed_image stat_failed_image c_form_feed ieee_underflow_flag iso_c_binding random_init out_of_range ieee_modes_type user_functions=cdabs cdcos cdexp cdlog cdsin cdsqrt cotan cotand dcmplx dconjg dcotan dcotand decode dimag dll_export dll_import doublecomplex dreal dvchk encode find flen flush getarg getcharqq getcl getdat getenv gettim hfix ibchng identifier imag int1 int2 int4 intc intrup invalop iostat_msg isha ishc ishl jfix lacfar locking locnear map nargs nbreak ndperr ndpexc offset ovefl peekcharqq precfill prompt qabs qacos qacosd qasin qasind qatan qatand qatan2 qcmplx qconjg qcos qcosd qcosh qdim qexp qext qextd qfloat qimag qlog qlog10 qmax1 qmin1 qmod qreal qsign qsin qsind qsinh qsqrt qtan qtand qtanh ran rand randu rewrite segment setdat settim system timer undfl unlock union val virtual volatile zabs zcos zexp zlog zsin zsqrt diff --git a/data/geany.glade b/data/geany.glade index bace956482..98fc6b3ec7 100644 --- a/data/geany.glade +++ b/data/geany.glade @@ -4550,22 +4550,6 @@ 0 - - - Use fixed encoding when opening non-Unicode files - True - True - False - This option disables the automatic detection of the file encoding when opening non-Unicode files and opens the file with the specified encoding (usually not needed) - True - True - - - False - False - 1 - - True diff --git a/doc/geany.txt b/doc/geany.txt index 6e57f63723..b9c1ad4a2a 100644 --- a/doc/geany.txt +++ b/doc/geany.txt @@ -2473,12 +2473,12 @@ Open new documents from the command-line Default encoding (new files) The type of file encoding you wish to use when creating files. -Used fixed encoding when opening files - Assume all files you are opening are using the type of encoding specified below. - Default encoding (existing files) - Opens all files with the specified encoding instead of auto-detecting it. - Use this option when it's not possible for Geany to detect the exact encoding. + Selects the encoding used when opening existing files. If set to something + other than *Detect from file*, all files will be opened with the specified + encoding instead of auto-detecting it. + Use a specific encoding when it's not possible for Geany to detect the + correct one. Default end of line characters The end of line characters to which should be used for new files. diff --git a/meson.build b/meson.build index 13ee89714d..9e6634d188 100644 --- a/meson.build +++ b/meson.build @@ -168,7 +168,7 @@ cdata.set_quoted('REVISION', revision) python_command = get_option('python-command') if python_command == '' or python_command == 'auto' - if target_machine.system() == 'windows' + if host_machine.system() == 'windows' python_command = 'py' else python_command = 'python' @@ -213,7 +213,7 @@ endforeach geany_cflags = def_cflags have_gcc4_visibility = cc.has_argument('-fvisibility=hidden') geany_cflags += '-DGEANY_PRIVATE' -if target_machine.system() == 'windows' +if host_machine.system() == 'windows' geany_cflags += '-DGEANY_EXPORT_SYMBOL=__declspec(dllexport)' elif have_gcc4_visibility geany_cflags += '-fvisibility=hidden' @@ -660,6 +660,7 @@ ctags = static_library('ctags', 'ctags/parsers/markdown.h', 'ctags/parsers/nsis.c', 'ctags/parsers/objc.c', + 'ctags/parsers/ocaml.c', 'ctags/parsers/pascal.c', 'ctags/parsers/perl.c', 'ctags/parsers/perl.h', diff --git a/src/document.c b/src/document.c index 04574bd3ca..ff017e386c 100644 --- a/src/document.c +++ b/src/document.c @@ -998,19 +998,15 @@ static gboolean load_text_file(const gchar *locale_filename, const gchar *displa } if (! encodings_convert_to_utf8_auto(&filedata->data, &filedata->len, forced_enc, - &filedata->enc, &filedata->bom, &filedata->readonly)) + &filedata->enc, &filedata->bom, &filedata->readonly, &err)) { if (forced_enc) - { - ui_set_statusbar(TRUE, _("The file \"%s\" is not valid %s."), - display_filename, forced_enc); - } + ui_set_statusbar(TRUE, _("Failed to load file \"%s\" as %s: %s."), + display_filename, forced_enc, err->message); else - { - ui_set_statusbar(TRUE, - _("The file \"%s\" does not look like a text file or the file encoding is not supported."), - display_filename); - } + ui_set_statusbar(TRUE, _("Failed to load file \"%s\": %s."), + display_filename, err->message); + g_error_free(err); g_free(filedata->data); return FALSE; } diff --git a/src/encodings.c b/src/encodings.c index aab84d21c3..5a5f59bbaf 100644 --- a/src/encodings.c +++ b/src/encodings.c @@ -59,12 +59,24 @@ static gboolean pregs_loaded = FALSE; GeanyEncoding encodings[GEANY_ENCODINGS_MAX]; +static gboolean conversion_supported(const gchar *to, const gchar *from) +{ + GIConv conv = g_iconv_open(to, from); + if (conv == (GIConv) -1) + return FALSE; + + g_iconv_close(conv); + return TRUE; +} + + #define fill(Order, Group, Idx, Charset, Name) \ encodings[Idx].idx = Idx; \ encodings[Idx].order = Order; \ encodings[Idx].group = Group; \ encodings[Idx].charset = Charset; \ - encodings[Idx].name = Name; + encodings[Idx].name = Name; \ + encodings[Idx].supported = FALSE; static void init_encodings(void) { @@ -140,6 +152,19 @@ static void init_encodings(void) fill(14, EASTASIAN, GEANY_ENCODING_UHC, "UHC", _("Korean")); fill(0, NONE, GEANY_ENCODING_NONE, "None", _("Without encoding")); + + /* fill the flags member */ + for (guint i = 0; i < G_N_ELEMENTS(encodings); i++) + { + if (i == GEANY_ENCODING_NONE || conversion_supported("UTF-8", encodings[i].charset)) + encodings[i].supported = TRUE; + else + { + /* geany_debug() doesn't really work at this point, unless G_MESSAGES_DEBUG + * is set explicitly by the caller, but that's better than nothing */ + geany_debug("Encoding %s is not supported by the system", encodings[i].charset); + } + } } @@ -195,18 +220,13 @@ static gboolean encodings_charset_equals(const gchar *a, const gchar *b) GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset) { - gint i; - if (charset == NULL) return GEANY_ENCODING_UTF_8; - i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (encodings_charset_equals(charset, encodings[i].charset)) return i; - - ++i; } return GEANY_ENCODING_UTF_8; } @@ -214,18 +234,13 @@ GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset) const GeanyEncoding *encodings_get_from_charset(const gchar *charset) { - gint i; - if (charset == NULL) return &encodings[GEANY_ENCODING_UTF_8]; - i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (gint i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (encodings_charset_equals(charset, encodings[i].charset)) return &encodings[i]; - - ++i; } return NULL; @@ -303,12 +318,10 @@ void encodings_select_radio_item(const gchar *charset) g_return_if_fail(charset != NULL); - i = 0; - while (i < GEANY_ENCODINGS_MAX) + for (i = 0; i < GEANY_ENCODINGS_MAX; i++) { if (utils_str_equal(charset, encodings[i].charset)) break; - i++; } if (i == GEANY_ENCODINGS_MAX) i = GEANY_ENCODING_UTF_8; /* fallback to UTF-8 */ @@ -326,7 +339,7 @@ void encodings_select_radio_item(const gchar *charset) static GRegex *regex_compile(const gchar *pattern) { GError *error = NULL; - GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS, 0, &error); + GRegex *regex = g_regex_new(pattern, G_REGEX_CASELESS | G_REGEX_RAW, 0, &error); if (!regex) { @@ -405,11 +418,31 @@ void encodings_finalize(void) } +/* initialization of non-UI parts */ +void encodings_init_headless(void) +{ + static gboolean initialized = FALSE; + + if (initialized) + return; + + init_encodings(); + + if (! pregs_loaded) + { + pregs[0] = regex_compile(PATTERN_HTMLMETA); + pregs[1] = regex_compile(PATTERN_CODING); + pregs_loaded = TRUE; + } + + initialized = TRUE; +} + + void encodings_init(void) { GtkWidget *menu[2]; GCallback cb_func[2]; - gint group_sizes[GEANY_ENCODING_GROUPS_MAX] = { 0 }; const gchar *const groups[GEANY_ENCODING_GROUPS_MAX] = { [NONE] = NULL, @@ -421,14 +454,7 @@ void encodings_init(void) [UNICODE] = N_("_Unicode"), }; - init_encodings(); - - if (! pregs_loaded) - { - pregs[0] = regex_compile(PATTERN_HTMLMETA); - pregs[1] = regex_compile(PATTERN_CODING); - pregs_loaded = TRUE; - } + encodings_init_headless(); /* create encodings submenu in document menu */ menu[0] = ui_lookup_widget(main_widgets.window, "set_encoding1_menu"); @@ -436,9 +462,6 @@ void encodings_init(void) cb_func[0] = G_CALLBACK(encodings_radio_item_change_cb); cb_func[1] = G_CALLBACK(encodings_reload_radio_item_change_cb); - for (guint i = 0; i < G_N_ELEMENTS(encodings); i++) - group_sizes[encodings[i].group]++; - for (guint k = 0; k < 2; k++) { GSList *group = NULL; @@ -478,7 +501,8 @@ void encodings_init(void) } else item = gtk_menu_item_new_with_label(label); - gtk_widget_show(item); + if (encodings[i].supported) + gtk_widget_show(item); gtk_container_add(GTK_CONTAINER(submenus[encodings[i].group]), item); g_signal_connect(item, "activate", cb_func[k], (gpointer) encodings[i].charset); @@ -550,6 +574,9 @@ GtkTreeStore *encodings_encoding_store_new(gboolean has_detect) { gchar *encoding_string; + if (! encodings[i].supported) + continue; + switch (encodings[i].group) { case WESTEUROPEAN: iter_parent = &iter_westeuro; break; @@ -612,21 +639,9 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, } -/** - * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. - * If @a fast is not set, additional checks to validate the converted string are performed. - * - * @param buffer The input string to convert. - * @param size The length of the string, or -1 if the string is nul-terminated. - * @param charset The charset to be used for conversion. - * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. - * - * @return If the conversion was successful, a newly allocated nul-terminated string, - * which must be freed with @c g_free(). Otherwise @c NULL. - **/ -GEANY_API_SYMBOL -gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, - const gchar *charset, gboolean fast) +static gchar *convert_to_utf8_from_charset(const gchar *buffer, gssize size, + const gchar *charset, gboolean fast, + gsize *utf8_size, GError **error) { gchar *utf8_content = NULL; GError *conv_error = NULL; @@ -642,18 +657,22 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, if (fast) { utf8_content = converted_contents; - if (conv_error != NULL) g_error_free(conv_error); + if (conv_error != NULL) g_propagate_error(error, conv_error); } else if (conv_error != NULL || ! g_utf8_validate(converted_contents, bytes_written, NULL)) { if (conv_error != NULL) { geany_debug("Couldn't convert from %s to UTF-8 (%s).", charset, conv_error->message); - g_error_free(conv_error); + g_propagate_error(error, conv_error); conv_error = NULL; } else + { geany_debug("Couldn't convert from %s to UTF-8.", charset); + g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Data contains NULs")); + } utf8_content = NULL; g_free(converted_contents); @@ -664,10 +683,35 @@ gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, utf8_content = converted_contents; } + if (utf8_content && utf8_size) + *utf8_size = bytes_written; + return utf8_content; } +/** + * Tries to convert @a buffer into UTF-8 encoding from the encoding specified with @a charset. + * If @a fast is not set, additional checks to validate the converted string are performed. + * + * @param buffer The input string to convert. + * @param size The length of the string, or -1 if the string is nul-terminated. + * @param charset The charset to be used for conversion. + * @param fast @c TRUE to only convert the input and skip extended checks on the converted string. + * + * @return If the conversion was successful, a newly allocated nul-terminated string, + * which must be freed with @c g_free(). Otherwise @c NULL. + **/ +GEANY_API_SYMBOL +gchar *encodings_convert_to_utf8_from_charset(const gchar *buffer, gssize size, + const gchar *charset, gboolean fast) +{ + /* If fast=FALSE, we can safely ignore the size as the output cannot contain NULs. + * Otherwise, the caller already agrees on partial data anyway. */ + return convert_to_utf8_from_charset(buffer, size, charset, fast, NULL, NULL); +} + + static gchar *encodings_check_regexes(const gchar *buffer, gsize size) { guint i; @@ -684,7 +728,7 @@ static gchar *encodings_check_regexes(const gchar *buffer, gsize size) static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gssize size, - const gchar *suggested_charset, gchar **used_encoding) + const gchar *suggested_charset, gchar **used_encoding, gsize *utf8_size, GError **error) { const gchar *locale_charset = NULL; const gchar *charset; @@ -741,7 +785,7 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss else continue; } - else if (i >= 0) + else if (i >= 0 && encodings[i].supported) charset = encodings[i].charset; else /* in this case we have i == -2, continue to increase i and go ahead */ continue; @@ -751,7 +795,7 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss geany_debug("Trying to convert %" G_GSIZE_FORMAT " bytes of data from %s into UTF-8.", size, charset); - utf8_content = encodings_convert_to_utf8_from_charset(buffer, size, charset, FALSE); + utf8_content = convert_to_utf8_from_charset(buffer, size, charset, FALSE, utf8_size, NULL); if (G_LIKELY(utf8_content != NULL)) { @@ -768,6 +812,9 @@ static gchar *encodings_convert_to_utf8_with_suggestion(const gchar *buffer, gss } } + g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, + _("Data contains NULs or the encoding is not supported")); + return NULL; } @@ -791,7 +838,8 @@ gchar *encodings_convert_to_utf8(const gchar *buffer, gssize size, gchar **used_ /* first try to read the encoding from the file content */ regex_charset = encodings_check_regexes(buffer, size); - utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding); + /* we know this cannot succeed if there are NULs in the output, so ignoring the size is OK */ + utf8 = encodings_convert_to_utf8_with_suggestion(buffer, size, regex_charset, used_encoding, NULL, NULL); g_free(regex_charset); return utf8; @@ -870,30 +918,30 @@ typedef struct { gchar *data; /* null-terminated data */ gsize size; /* actual data size */ - gsize len; /* string length of data */ gchar *enc; gboolean bom; - gboolean partial; } BufferData; /* convert data with the specified encoding */ static gboolean -handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) +handle_forced_encoding(BufferData *buffer, const gchar *forced_enc, GError **error) { GeanyEncodingIndex enc_idx; if (utils_str_equal(forced_enc, "UTF-8")) { - if (! g_utf8_validate(buffer->data, buffer->len, NULL)) + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) { + g_set_error(error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, + _("Data contains NULs or is not valid UTF-8")); return FALSE; } } else { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, forced_enc, FALSE); + gchar *converted_text = convert_to_utf8_from_charset( + buffer->data, buffer->size, forced_enc, FALSE, &buffer->size, error); if (converted_text == NULL) { return FALSE; @@ -901,7 +949,6 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) else { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } } enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); @@ -913,15 +960,14 @@ handle_forced_encoding(BufferData *buffer, const gchar *forced_enc) /* detect encoding and convert to UTF-8 if necessary */ static gboolean -handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) +handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx, GError **error) { g_return_val_if_fail(buffer->enc == NULL, FALSE); g_return_val_if_fail(buffer->bom == FALSE, FALSE); if (buffer->size == 0) { - /* we have no data so assume UTF-8, buffer->len can be 0 even we have an empty - * e.g. UTF32 file with a BOM(so size is 4, len is 0) */ + /* we have no data so assume UTF-8 */ buffer->enc = g_strdup("UTF-8"); } else @@ -932,14 +978,22 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) buffer->enc = g_strdup(encodings[enc_idx].charset); buffer->bom = TRUE; - if (enc_idx != GEANY_ENCODING_UTF_8) /* the BOM indicated something else than UTF-8 */ + if (enc_idx == GEANY_ENCODING_UTF_8) + { + if (! g_utf8_validate(buffer->data, buffer->size, NULL)) + { + /* this is not actually valid UTF-8 */ + SETPTR(buffer->enc, NULL); + buffer->bom = FALSE; + } + } + else /* the BOM indicated something else than UTF-8 */ { - gchar *converted_text = encodings_convert_to_utf8_from_charset( - buffer->data, buffer->size, buffer->enc, FALSE); + gchar *converted_text = convert_to_utf8_from_charset( + buffer->data, buffer->size, buffer->enc, FALSE, &buffer->size, NULL); if (converted_text != NULL) { SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } else { @@ -957,7 +1011,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) /* try UTF-8 first */ if (encodings_get_idx_from_charset(regex_charset) == GEANY_ENCODING_UTF_8 && - (buffer->size == buffer->len) && g_utf8_validate(buffer->data, buffer->len, NULL)) + g_utf8_validate(buffer->data, buffer->size, NULL)) { buffer->enc = g_strdup("UTF-8"); } @@ -965,7 +1019,7 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) { /* detect the encoding */ gchar *converted_text = encodings_convert_to_utf8_with_suggestion(buffer->data, - buffer->size, regex_charset, &buffer->enc); + buffer->size, regex_charset, &buffer->enc, &buffer->size, error); if (converted_text == NULL) { @@ -973,7 +1027,6 @@ handle_encoding(BufferData *buffer, GeanyEncodingIndex enc_idx) return FALSE; } SETPTR(buffer->data, converted_text); - buffer->len = strlen(converted_text); } g_free(regex_charset); } @@ -990,16 +1043,16 @@ handle_bom(BufferData *buffer) encodings_scan_unicode_bom(buffer->data, buffer->size, &bom_len); g_return_if_fail(bom_len != 0); - /* use filedata->len here because the contents are already converted into UTF-8 */ - buffer->len -= bom_len; + /* the contents are already converted into UTF-8 here */ + buffer->size -= bom_len; /* overwrite the BOM with the remainder of the file contents, plus the NULL terminator. */ - memmove(buffer->data, buffer->data + bom_len, buffer->len + 1); - buffer->data = g_realloc(buffer->data, buffer->len + 1); + memmove(buffer->data, buffer->data + bom_len, buffer->size + 1); + buffer->data = g_realloc(buffer->data, buffer->size + 1); } /* loads textfile data, verifies and converts to forced_enc or UTF-8. Also handles BOM. */ -static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) +static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc, GError **error) { GeanyEncodingIndex tmp_enc_idx; @@ -1007,16 +1060,6 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * if we have a BOM */ tmp_enc_idx = encodings_scan_unicode_bom(buffer->data, buffer->size, NULL); - /* check whether the size of the loaded data is equal to the size of the file in the - * filesystem file size may be 0 to allow opening files in /proc/ which have typically a - * file size of 0 bytes */ - if (buffer->len != buffer->size && buffer->size != 0 && ( - tmp_enc_idx == GEANY_ENCODING_UTF_8 || /* tmp_enc_idx can be UTF-7/8/16/32, UCS and None */ - tmp_enc_idx == GEANY_ENCODING_UTF_7)) /* filter UTF-7/8 where no NULL bytes are allowed */ - { - buffer->partial = TRUE; - } - /* Determine character encoding and convert to UTF-8 */ if (forced_enc != NULL) { @@ -1026,12 +1069,12 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) buffer->bom = FALSE; buffer->enc = g_strdup(encodings[GEANY_ENCODING_NONE].charset); } - else if (! handle_forced_encoding(buffer, forced_enc)) + else if (! handle_forced_encoding(buffer, forced_enc, error)) { return FALSE; } } - else if (! handle_encoding(buffer, tmp_enc_idx)) + else if (! handle_encoding(buffer, tmp_enc_idx, error)) { return FALSE; } @@ -1053,35 +1096,33 @@ static gboolean handle_buffer(BufferData *buffer, const gchar *forced_enc) * @param forced_enc forced encoding to use, or @c NULL * @param used_encoding return location for the actually used encoding, or @c NULL * @param has_bom return location to store whether the data had a BOM, or @c NULL - * @param partial return location to store whether the conversion may be partial, or @c NULL + * @param has_nuls return location to store whether the converted data contains NULs, or @c NULL * * @return @C TRUE if the conversion succeeded, @c FALSE otherwise. */ +GEANY_EXPORT_SYMBOL gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial) + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, GError **error) { BufferData buffer; buffer.data = *buf; buffer.size = *size; - /* use strlen to check for null chars */ - buffer.len = strlen(buffer.data); buffer.enc = NULL; buffer.bom = FALSE; - buffer.partial = FALSE; - if (! handle_buffer(&buffer, forced_enc)) + if (! handle_buffer(&buffer, forced_enc, error)) return FALSE; - *size = buffer.len; + *size = buffer.size; if (used_encoding) *used_encoding = buffer.enc; else g_free(buffer.enc); if (has_bom) *has_bom = buffer.bom; - if (partial) - *partial = buffer.partial; + if (has_nuls) + *has_nuls = strlen(buffer.data) != buffer.size; *buf = buffer.data; return TRUE; diff --git a/src/encodingsprivate.h b/src/encodingsprivate.h index 7baf5d9b9e..4affbb22eb 100644 --- a/src/encodingsprivate.h +++ b/src/encodingsprivate.h @@ -46,6 +46,7 @@ typedef struct GeanyEncoding GeanyEncodingGroup group; /* Internally used member for grouping */ const gchar *charset; /* String representation of the encoding, e.g. "ISO-8859-3" */ const gchar *name; /* Translatable and descriptive name of the encoding, e.g. "South European" */ + gboolean supported; /* Whether this encoding is supported on the system */ } GeanyEncoding; @@ -57,6 +58,7 @@ const gchar* encodings_get_charset(const GeanyEncoding* enc); void encodings_select_radio_item(const gchar *charset); +void encodings_init_headless(void); void encodings_init(void); void encodings_finalize(void); @@ -72,7 +74,8 @@ void encodings_encoding_store_cell_data_func(GtkCellLayout *cell_layout, GtkCell gboolean encodings_is_unicode_charset(const gchar *string); gboolean encodings_convert_to_utf8_auto(gchar **buf, gsize *size, const gchar *forced_enc, - gchar **used_encoding, gboolean *has_bom, gboolean *partial); + gchar **used_encoding, gboolean *has_bom, gboolean *has_nuls, + GError **error); GeanyEncodingIndex encodings_scan_unicode_bom(const gchar *string, gsize len, guint *bom_len); diff --git a/src/filetypes.c b/src/filetypes.c index b0a52a1eec..b1913e3689 100644 --- a/src/filetypes.c +++ b/src/filetypes.c @@ -138,7 +138,7 @@ static void init_builtin_filetypes(void) FT_INIT( FORTRAN, FORTRAN, "Fortran", "Fortran (F90)", SOURCE_FILE, COMPILED ); FT_INIT( F77, FORTRAN, "F77", "Fortran (F77)", SOURCE_FILE, COMPILED ); FT_INIT( GLSL, C, "GLSL", NULL, SOURCE_FILE, COMPILED ); - FT_INIT( CAML, NONE, "CAML", "(O)Caml", SOURCE_FILE, COMPILED ); + FT_INIT( CAML, OCAML, "CAML", "(O)Caml", SOURCE_FILE, COMPILED ); FT_INIT( PERL, PERL, "Perl", NULL, SOURCE_FILE, SCRIPT ); FT_INIT( PHP, PHP, "PHP", NULL, SOURCE_FILE, SCRIPT ); FT_INIT( JS, JAVASCRIPT, "Javascript", NULL, SOURCE_FILE, SCRIPT ); diff --git a/src/geanyentryaction.c b/src/geanyentryaction.c index e8f1a5e4fe..d167019642 100644 --- a/src/geanyentryaction.c +++ b/src/geanyentryaction.c @@ -42,7 +42,6 @@ struct _GeanyEntryActionPrivate { GtkWidget *entry; gboolean numeric; - gboolean connected; }; enum @@ -112,7 +111,7 @@ static void geany_entry_action_connect_proxy(GtkAction *action, GtkWidget *widge GeanyEntryActionPrivate *priv = GEANY_ENTRY_ACTION_GET_PRIVATE(action); /* make sure not to connect handlers twice */ - if (! priv->connected) + if (! g_object_get_data(G_OBJECT(widget), "gea-connected")) { if (priv->numeric) g_signal_connect(priv->entry, "insert-text", @@ -122,7 +121,7 @@ static void geany_entry_action_connect_proxy(GtkAction *action, GtkWidget *widge g_signal_connect(priv->entry, "activate-backward", G_CALLBACK(delegate_entry_activate_backward_cb), action); - priv->connected = TRUE; + g_object_set_data(G_OBJECT(widget), "gea-connected", action /* anything non-NULL */); } GTK_ACTION_CLASS(geany_entry_action_parent_class)->connect_proxy(action, widget); @@ -176,7 +175,6 @@ static void geany_entry_action_init(GeanyEntryAction *action) priv = action->priv; priv->entry = NULL; priv->numeric = FALSE; - priv->connected = FALSE; } diff --git a/src/libmain.c b/src/libmain.c index 0dce4763ab..0756e7bbd0 100644 --- a/src/libmain.c +++ b/src/libmain.c @@ -1033,6 +1033,8 @@ void main_init_headless(void) memset(&template_prefs, 0, sizeof(GeanyTemplatePrefs)); memset(&ui_prefs, 0, sizeof(UIPrefs)); memset(&ui_widgets, 0, sizeof(UIWidgets)); + + encodings_init_headless(); } diff --git a/src/prefs.c b/src/prefs.c index 40c23ccc52..cbb2c01c24 100644 --- a/src/prefs.c +++ b/src/prefs.c @@ -89,7 +89,6 @@ static void on_show_notebook_tabs_toggled(GtkToggleButton *togglebutton, gpointe static void on_enable_plugins_toggled(GtkToggleButton *togglebutton, gpointer user_data); static void on_use_folding_toggled(GtkToggleButton *togglebutton, gpointer user_data); static void on_check_line_end_toggled(GtkToggleButton *togglebutton, gpointer user_data); -static void on_open_encoding_toggled(GtkToggleButton *togglebutton, gpointer user_data); static void on_sidebar_visible_toggled(GtkToggleButton *togglebutton, gpointer user_data); static void on_prefs_print_radio_button_toggled(GtkToggleButton *togglebutton, gpointer user_data); static void on_prefs_print_page_header_toggled(GtkToggleButton *togglebutton, gpointer user_data); @@ -533,18 +532,11 @@ static void prefs_init_dialog(void) widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_new_encoding"); ui_encodings_combo_box_set_active_encoding(GTK_COMBO_BOX(widget), file_prefs.default_new_encoding); - widget = ui_lookup_widget(ui_widgets.prefs_dialog, "check_open_encoding"); - gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(widget), - (file_prefs.default_open_encoding >= 0) ? TRUE : FALSE); - on_open_encoding_toggled(GTK_TOGGLE_BUTTON(widget), NULL); - widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_open_encoding"); if (file_prefs.default_open_encoding >= 0) - { ui_encodings_combo_box_set_active_encoding(GTK_COMBO_BOX(widget), file_prefs.default_open_encoding); - } else - ui_encodings_combo_box_set_active_encoding(GTK_COMBO_BOX(widget), GEANY_ENCODING_UTF_8); + ui_encodings_combo_box_set_active_encoding(GTK_COMBO_BOX(widget), GEANY_ENCODINGS_MAX); widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_eol"); if (file_prefs.default_eol_character >= 0 && file_prefs.default_eol_character < 3) @@ -1006,13 +998,9 @@ on_prefs_dialog_response(GtkDialog *dialog, gint response, gpointer user_data) widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_new_encoding"); file_prefs.default_new_encoding = ui_encodings_combo_box_get_active_encoding(GTK_COMBO_BOX(widget)); - widget = ui_lookup_widget(ui_widgets.prefs_dialog, "check_open_encoding"); - if (gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(widget))) - { - widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_open_encoding"); - file_prefs.default_open_encoding = ui_encodings_combo_box_get_active_encoding(GTK_COMBO_BOX(widget)); - } - else + widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_open_encoding"); + file_prefs.default_open_encoding = ui_encodings_combo_box_get_active_encoding(GTK_COMBO_BOX(widget)); + if (file_prefs.default_open_encoding >= GEANY_ENCODINGS_MAX) file_prefs.default_open_encoding = -1; widget = ui_lookup_widget(ui_widgets.prefs_dialog, "combo_eol"); @@ -1547,15 +1535,6 @@ static void on_enable_plugins_toggled(GtkToggleButton *togglebutton, gpointer us } -static void on_open_encoding_toggled(GtkToggleButton *togglebutton, gpointer user_data) -{ - gboolean sens = gtk_toggle_button_get_active(togglebutton); - - gtk_widget_set_sensitive(ui_lookup_widget(ui_widgets.prefs_dialog, "eventbox3"), sens); - gtk_widget_set_sensitive(ui_lookup_widget(ui_widgets.prefs_dialog, "label_open_encoding"), sens); -} - - static void on_sidebar_visible_toggled(GtkToggleButton *togglebutton, gpointer user_data) { gboolean sens = gtk_toggle_button_get_active(togglebutton); @@ -1664,23 +1643,24 @@ void prefs_show_dialog(void) { struct { const gchar *combo, *renderer; + gboolean has_detect; } names[] = { - { "combo_new_encoding", "combo_new_encoding_renderer" }, - { "combo_open_encoding", "combo_open_encoding_renderer" } + { "combo_new_encoding", "combo_new_encoding_renderer", FALSE }, + { "combo_open_encoding", "combo_open_encoding_renderer", TRUE } }; guint i; - GtkTreeStore *encoding_list = encodings_encoding_store_new(FALSE); for (i = 0; i < G_N_ELEMENTS(names); i++) { + GtkTreeStore *encoding_list = encodings_encoding_store_new(names[i].has_detect); GtkWidget *combo = ui_lookup_widget(ui_widgets.prefs_dialog, names[i].combo); gtk_cell_layout_set_cell_data_func(GTK_CELL_LAYOUT(combo), ui_builder_get_object(names[i].renderer), encodings_encoding_store_cell_data_func, NULL, NULL); gtk_combo_box_set_model(GTK_COMBO_BOX(combo), GTK_TREE_MODEL(encoding_list)); + g_object_unref(encoding_list); } - g_object_unref(encoding_list); } /* init the eol character combo box */ @@ -1810,8 +1790,6 @@ void prefs_show_dialog(void) "toggled", G_CALLBACK(on_use_folding_toggled), NULL); g_signal_connect(ui_lookup_widget(ui_widgets.prefs_dialog, "check_line_end"), "toggled", G_CALLBACK(on_check_line_end_toggled), NULL); - g_signal_connect(ui_lookup_widget(ui_widgets.prefs_dialog, "check_open_encoding"), - "toggled", G_CALLBACK(on_open_encoding_toggled), NULL); g_signal_connect(ui_lookup_widget(ui_widgets.prefs_dialog, "check_sidebar_visible"), "toggled", G_CALLBACK(on_sidebar_visible_toggled), NULL); diff --git a/src/tagmanager/tm_ctags.c b/src/tagmanager/tm_ctags.c index 9a971b14fd..9cde83f317 100644 --- a/src/tagmanager/tm_ctags.c +++ b/src/tagmanager/tm_ctags.c @@ -241,6 +241,33 @@ void tm_ctags_clear_ignore_symbols(void) } +static gboolean replace_str(gchar **where, const gchar *what, guint what_len, + const gchar *replacement, guint replacement_len) +{ + if (where && *where) + { + gchar *pos = strstr(*where, what); + + if (pos) + { + gsize where_len = strlen(*where); + gchar *str = g_malloc(where_len + replacement_len - what_len + 1); + gsize prefix_len = (gsize) (pos - *where); + + strncpy(str, *where, prefix_len); + strcpy(str + prefix_len, replacement); + strcpy(str + prefix_len + replacement_len, pos + what_len); + g_free(*where); + *where = str; + + return TRUE; + } + } + + return FALSE; +} + + /* call after all tags have been collected so we don't have to handle reparses * with the counter (which gets complicated when also subparsers are involved) */ static void rename_anon_tags(TMSourceFile *source_file) @@ -336,7 +363,6 @@ static void rename_anon_tags(TMSourceFile *source_file) { TMTag *nested_tag = TM_TAG(source_file->tags_array->pdata[j]); guint nested_scope_len = nested_tag->scope ? strlen(nested_tag->scope) : 0; - gchar *pos; /* Tags can be interleaved with scopeless macros - skip those */ if (is_c && nested_tag->type & (tm_tag_macro_t | tm_tag_macro_with_arg_t)) @@ -361,22 +387,14 @@ static void rename_anon_tags(TMSourceFile *source_file) if (nested_scope_len <= scope_len) break; - pos = strstr(nested_tag->scope, orig_name); /* We found the parent name in the nested tag scope - replace it * with the new name. Note: anonymous tag names generated by * ctags are unique enough that we don't have to check for * scope separators here. */ - if (pos) - { - gchar *str = g_malloc(nested_scope_len + 50); - guint prefix_len = pos - nested_tag->scope; - - strncpy(str, nested_tag->scope, prefix_len); - strcpy(str + prefix_len, new_name); - strcpy(str + prefix_len + new_name_len, pos + orig_name_len); - g_free(nested_tag->scope); - nested_tag->scope = str; - } + replace_str(&nested_tag->scope, orig_name, orig_name_len, new_name, new_name_len); + + /* Do the same for var_type as well */ + replace_str(&nested_tag->var_type, orig_name, orig_name_len, new_name, new_name_len); } /* We are out of the nesting - the next tags could be variables @@ -385,22 +403,13 @@ static void rename_anon_tags(TMSourceFile *source_file) { TMTag *var_tag = TM_TAG(source_file->tags_array->pdata[j]); guint var_scope_len = var_tag->scope ? strlen(var_tag->scope) : 0; - gchar *pos; /* Should be at the same scope level as the anon tag */ - if (var_scope_len == scope_len && - var_tag->var_type && (pos = strstr(var_tag->var_type, orig_name))) + if (var_scope_len != scope_len || ! var_tag->var_type || + ! replace_str(&var_tag->var_type, orig_name, orig_name_len, new_name, new_name_len)) { - GString *str = g_string_new(var_tag->var_type); - gssize p = pos - var_tag->var_type; - g_string_erase(str, p, strlen(orig_name)); - g_string_insert(str, p, new_name); - g_free(var_tag->var_type); - var_tag->var_type = str->str; - g_string_free(str, FALSE); - } - else break; + } j++; } diff --git a/src/tagmanager/tm_parser.c b/src/tagmanager/tm_parser.c index c120b6117b..7ab7411b38 100644 --- a/src/tagmanager/tm_parser.c +++ b/src/tagmanager/tm_parser.c @@ -344,7 +344,7 @@ static TMParserMapGroup group_DOCBOOK[] = { static TMParserMapEntry map_ERLANG[] = { {'d', tm_tag_macro_t}, // macro {'f', tm_tag_function_t}, // function - {'m', tm_tag_undef_t}, // module + {'m', tm_tag_namespace_t}, // module {'r', tm_tag_struct_t}, // record {'t', tm_tag_typedef_t}, // type }; @@ -353,6 +353,7 @@ static TMParserMapGroup group_ERLANG[] = { {N_("Structs"), TM_ICON_STRUCT, tm_tag_struct_t}, {N_("Typedefs / Enums"), TM_ICON_STRUCT, tm_tag_typedef_t}, {N_("Macros"), TM_ICON_MACRO, tm_tag_macro_t}, + {N_("Module"), TM_ICON_NAMESPACE, tm_tag_namespace_t}, }; // no scope information @@ -1114,6 +1115,26 @@ static TMParserMapGroup group_RAKU[] = { {N_("Rules / Tokens"), TM_ICON_VAR, tm_tag_variable_t}, }; +static TMParserMapEntry map_OCAML[] = { + {'c', tm_tag_class_t}, // class + {'m', tm_tag_method_t}, // method + {'M', tm_tag_package_t}, // module + {'v', tm_tag_variable_t}, // var + {'p', tm_tag_undef_t}, // val + {'t', tm_tag_typedef_t}, // type + {'f', tm_tag_function_t}, // function + {'C', tm_tag_undef_t}, // Constructor + {'r', tm_tag_undef_t}, // RecordField + {'e', tm_tag_undef_t}, // Exception +}; +static TMParserMapGroup group_OCAML[] = { + {N_("Modules"), TM_ICON_NAMESPACE, tm_tag_package_t}, + {N_("Classes"), TM_ICON_CLASS, tm_tag_class_t}, + {N_("Types"), TM_ICON_STRUCT, tm_tag_typedef_t}, + {N_("Functions"), TM_ICON_METHOD, tm_tag_method_t | tm_tag_function_t}, + {N_("Variables"), TM_ICON_VAR, tm_tag_variable_t}, +}; + typedef struct { TMParserMapEntry *entries; @@ -1187,6 +1208,7 @@ static TMParserMap parser_map[] = { MAP_ENTRY(BATCH), MAP_ENTRY(AUTOIT), MAP_ENTRY(RAKU), + MAP_ENTRY(OCAML), }; /* make sure the parser map is consistent and complete */ G_STATIC_ASSERT(G_N_ELEMENTS(parser_map) == TM_PARSER_COUNT); @@ -1728,6 +1750,7 @@ gboolean tm_parser_has_full_scope(TMParserType lang) case TM_PARSER_ERLANG: case TM_PARSER_FORTRAN: case TM_PARSER_OBJC: + case TM_PARSER_OCAML: case TM_PARSER_REST: /* Other parsers don't use scope at all (or should be somewhere above) */ default: diff --git a/src/tagmanager/tm_parser.h b/src/tagmanager/tm_parser.h index 7672880932..043edc7713 100644 --- a/src/tagmanager/tm_parser.h +++ b/src/tagmanager/tm_parser.h @@ -120,6 +120,7 @@ enum TM_PARSER_BATCH, TM_PARSER_AUTOIT, TM_PARSER_RAKU, + TM_PARSER_OCAML, TM_PARSER_COUNT }; diff --git a/src/tagmanager/tm_parsers.h b/src/tagmanager/tm_parsers.h index 535a496ee1..808cd419a3 100644 --- a/src/tagmanager/tm_parsers.h +++ b/src/tagmanager/tm_parsers.h @@ -74,6 +74,7 @@ TypeScriptParser, \ DosBatchParser, \ AutoItParser, \ - Perl6Parser + Perl6Parser, \ + OcamlParser #endif diff --git a/src/templates.c b/src/templates.c index 5181ab97d0..80388f0578 100644 --- a/src/templates.c +++ b/src/templates.c @@ -70,15 +70,18 @@ static gchar *read_file(const gchar *locale_fname) gchar *contents; gsize length; GString *str; + GError *err = NULL; if (! g_file_get_contents(locale_fname, &contents, &length, NULL)) return NULL; - if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL)) + if (! encodings_convert_to_utf8_auto(&contents, &length, NULL, NULL, NULL, NULL, &err)) { gchar *utf8_fname = utils_get_utf8_from_locale(locale_fname); - ui_set_statusbar(TRUE, _("Failed to convert template file \"%s\" to UTF-8"), utf8_fname); + ui_set_statusbar(TRUE, _("Failed to convert template file \"%s\" to UTF-8: %s"), + utf8_fname, err->message); + g_error_free(err); g_free(utf8_fname); g_free(contents); return NULL; diff --git a/src/utils.c b/src/utils.c index c8f264d0da..1d3e377c80 100644 --- a/src/utils.c +++ b/src/utils.c @@ -766,21 +766,37 @@ gchar *utils_get_date_time(const gchar *format, time_t *time_to_use) } +/* Extracts initials from @p name, with basic Unicode support */ +GEANY_EXPORT_SYMBOL gchar *utils_get_initials(const gchar *name) { - gint i = 1, j = 1; - gchar *initials = g_malloc0(5); + GString *initials; + gchar *composed; + gboolean at_bound = TRUE; + + g_return_val_if_fail(name != NULL, NULL); + + composed = g_utf8_normalize(name, -1, G_NORMALIZE_ALL_COMPOSE); + g_return_val_if_fail(composed != NULL, NULL); - initials[0] = name[0]; - while (name[i] != '\0' && j < 4) + initials = g_string_new(NULL); + for (const gchar *p = composed; *p; p = g_utf8_next_char(p)) { - if (name[i] == ' ' && name[i + 1] != ' ') + gunichar ch = g_utf8_get_char(p); + + if (g_unichar_isspace(ch)) + at_bound = TRUE; + else if (at_bound) { - initials[j++] = name[i + 1]; + const gchar *end = g_utf8_next_char(p); + g_string_append_len(initials, p, end - p); + at_bound = FALSE; } - i++; } - return initials; + + g_free(composed); + + return g_string_free(initials, FALSE); } diff --git a/tests/Makefile.am b/tests/Makefile.am index 12bf58799c..d59068c57e 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -7,9 +7,10 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/tagmanager -I$(top_srcdir)/src AM_CFLAGS = $(GTK_CFLAGS) AM_LDFLAGS = $(GTK_LIBS) $(INTLLIBS) -no-install -check_PROGRAMS = test_utils test_sidebar +check_PROGRAMS = test_utils test_sidebar test_encodings test_utils_LDADD = $(top_builddir)/src/libgeany.la test_sidebar_LDADD = $(top_builddir)/src/libgeany.la +test_encodings_LDADD = $(top_builddir)/src/libgeany.la TESTS = $(check_PROGRAMS) diff --git a/tests/ctags/Makefile.am b/tests/ctags/Makefile.am index 8bdea43729..8390acd2d8 100644 --- a/tests/ctags/Makefile.am +++ b/tests/ctags/Makefile.am @@ -252,6 +252,7 @@ test_sources = \ namespace.cpp \ namespaces2.php \ namespaces.php \ + nested_anon.c \ no_terminator.js \ non-ascii-ident1.php \ numlib.f90 \ @@ -313,6 +314,7 @@ test_sources = \ simple.lua \ simple.mak \ simple.md \ + simple.ml \ simple.php \ simple.pl \ simple.ps1 \ diff --git a/tests/ctags/maze.erl.tags b/tests/ctags/maze.erl.tags index 6e9bcf1400..a3c535afdb 100644 --- a/tests/ctags/maze.erl.tags +++ b/tests/ctags/maze.erl.tags @@ -2,6 +2,8 @@ build function: maze :: build generate16maze0 function: maze :: generate +maze2560 +namespace: maze pick16maze0 function: maze :: pick scramble16maze0 diff --git a/tests/ctags/nested_anon.c b/tests/ctags/nested_anon.c new file mode 100644 index 0000000000..0efb63f36a --- /dev/null +++ b/tests/ctags/nested_anon.c @@ -0,0 +1,5 @@ +typedef struct { + struct + { + } __value32; +} __atomic_wide_counter; diff --git a/tests/ctags/nested_anon.c.tags b/tests/ctags/nested_anon.c.tags new file mode 100644 index 0000000000..aef4a45c10 --- /dev/null +++ b/tests/ctags/nested_anon.c.tags @@ -0,0 +1,6 @@ +__atomic_wide_counter20480 +struct: __atomic_wide_counter +__value3264__atomic_wide_counter0__atomic_wide_counter::anon_struct_1 +member: __atomic_wide_counter::anon_struct_1 __atomic_wide_counter :: __value32 +anon_struct_12048__atomic_wide_counter1 +struct: __atomic_wide_counter :: anon_struct_1 flags: 1 diff --git a/tests/ctags/simple.ml b/tests/ctags/simple.ml new file mode 100644 index 0000000000..373f3a99c5 --- /dev/null +++ b/tests/ctags/simple.ml @@ -0,0 +1,35 @@ +module ModuleFoo = struct + type foobar = + ConstructorFoo + | ConstructorBar of int * char list +end + +type 'a foorecord = + { foofield : 'a; + barfield : int; + mutable foobarfield : list char -> int -> unit } + +(* op redif *) +let (+-) a b = + let aplus = a + b + and aminus = a - b + in + (aplus, aminus) + +let shall_appear () = + let sub_not 1 = 2 + and shall_not_either fu = () in + let nope = 3 +and must_appear_also 4 = () + +let y = 4 + +let foo_function a b = (a, b) + +class fooClass = +object (self) + val x = () + method fooMethod = x +end + +exception ConnectionNotReachable diff --git a/tests/ctags/simple.ml.tags b/tests/ctags/simple.ml.tags new file mode 100644 index 0000000000..6e4160efc1 --- /dev/null +++ b/tests/ctags/simple.ml.tags @@ -0,0 +1,20 @@ ++-160 +function: +- +ModuleFoo5120 +package: ModuleFoo +Simple5120 +package: Simple +fooClass10 +class: fooClass +fooMethod128fooClass0 +method: fooClass :: fooMethod +foo_function160 +function: foo_function +foobar4096ModuleFoo0 +typedef: ModuleFoo :: foobar +foorecord40960 +typedef: foorecord +shall_appear160 +function: shall_appear +y163840 +variable: y diff --git a/tests/ctags/test.erl.tags b/tests/ctags/test.erl.tags index 52b89fdd59..1f727c4f80 100644 --- a/tests/ctags/test.erl.tags +++ b/tests/ctags/test.erl.tags @@ -6,5 +6,7 @@ function1 function: test :: function1 record120480 struct: record1 +test2560 +namespace: test type140960 typedef: type1 diff --git a/tests/meson.build b/tests/meson.build index 74cc7b322a..f6c6dac567 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -249,6 +249,7 @@ ctags_tests = files([ 'ctags/namespace.cpp.tags', 'ctags/namespaces2.php.tags', 'ctags/namespaces.php.tags', + 'ctags/nested_anon.c.tags', 'ctags/no_terminator.js.tags', 'ctags/non-ascii-ident1.php.tags', 'ctags/numlib.f90.tags', @@ -310,6 +311,7 @@ ctags_tests = files([ 'ctags/simple.lua.tags', 'ctags/simple.mak.tags', 'ctags/simple.md.tags', + 'ctags/simple.ml.tags', 'ctags/simple.php.tags', 'ctags/simple.pl.tags', 'ctags/simple.ps1.tags', @@ -371,3 +373,4 @@ test('ctags/processing-order', runner, env: ['top_srcdir='+meson.source_root(), 'top_builddir='+meson.build_root()]) test('utils', executable('test_utils', 'test_utils.c', dependencies: test_deps)) test('sidebar', executable('test_sidebar', 'test_sidebar.c', dependencies: test_deps)) +test('encodings', executable('test_encodings', 'test_encodings.c', dependencies: test_deps)) diff --git a/tests/test_encodings.c b/tests/test_encodings.c new file mode 100644 index 0000000000..403ddcd7e4 --- /dev/null +++ b/tests/test_encodings.c @@ -0,0 +1,289 @@ +/* + * Copyright 2023 The Geany contributors + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "encodingsprivate.h" +#include "main.h" + + +/* Asserts 2 bytes buffers are identical, trying to provide a somewhat useful + * error if not. */ +static void assert_cmpmem_eq_impl(const char *p1, const char *p2, gsize len, + const char *domain, const char *file, int line, const char *func, + const char *expr) +{ + gchar *msg; + gsize i; + + for (i = 0; i < len && p1[i] == p2[i]; i++) + ; + if (i == len) + return; + + msg = g_strdup_printf("assertion failed (%s): bytes %#x and %#x differ at offset %lu (at \"%s\" and \"%s\")", + expr, (guint) (guchar) p1[i], (guint) (guchar) p2[i], i, p1 + i, p2 + i); + g_assertion_message(domain, file, line, func, msg); + g_free(msg); +} + +#define assert_cmpmem_eq_with_caller(p1, p2, len, domain, file, line, func) \ + assert_cmpmem_eq_impl(p1, p2, len, domain, file, line, func, #p1 " == " #p2) + +#define assert_cmpmem_eq(p1, p2, len) assert_cmpmem_eq_impl(p1, p2, len, \ + G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, #p1 " == " #p2) + +/* + * @brief More convenient test API for encodings_convert_to_utf8_auto() + * @param input Input buffer, NUL-terminated (well, at least there should be a + * trailing NUL). + * @param input_size Actual size of @p input buffer, without the trailing NUL + * @param disk_size Size on disk (as reported by e.g stat -- that may be 0 for + * virtual files, otherwise should be input_size) + * @param forced_enc Forced encoding, or NULL + * @param expected_output Expected output data + * @param expected_size Expected output size + * @param expected_encoding Expected output encoding + * @param expected_has_bom Whether the input contains a BOM + * @param expected_partial Whether the output is expected to be truncated + * @returns Whether the conversion succeeded and followed the parameters + */ +static gboolean assert_convert_to_utf8_auto_impl( + const char *domain, const char *file, int line, const char *func, + const gchar *input, gsize input_size, + const gsize disk_size, const gchar *forced_enc, + const gchar *expected_output, gsize expected_size, const gchar *expected_encoding, + gboolean expected_has_bom, gboolean expected_partial) +{ + gchar *buf = g_memdup(input, input_size + 1); + gsize size = disk_size; + gchar *used_encoding = NULL; + gboolean has_bom = FALSE; + gboolean partial = FALSE; + gboolean ret; + GError *err = NULL; + + g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: converting %lu bytes", file, line, func, input_size); + ret = encodings_convert_to_utf8_auto(&buf, &size, forced_enc, &used_encoding, &has_bom, &partial, &err); + fflush(stdout); + if (! ret) + { + g_log(domain, G_LOG_LEVEL_INFO, "%s:%d:%s: conversion failed: %s", file, line, func, err->message); + g_error_free(err); + } + else + { + assert_cmpmem_eq_with_caller(buf, expected_output, MIN(size, expected_size), + domain, file, line, func); + g_assert_cmpuint(size, ==, expected_size); + if (expected_encoding) + g_assert_cmpstr(expected_encoding, ==, used_encoding); + g_assert_cmpint(has_bom, ==, expected_has_bom); + g_assert_cmpint(partial, ==, expected_partial); + + g_free(used_encoding); + } + + g_free(buf); + + return ret; +} + + +#define assert_convert_to_utf8_auto(input, input_size, disk_size, forced_enc, \ + expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) \ + assert_convert_to_utf8_auto_impl(G_LOG_DOMAIN, __FILE__, __LINE__, G_STRFUNC, \ + input, input_size, disk_size, forced_enc, \ + expected_output, expected_size, expected_encoding, expected_has_bom, expected_partial) + + +static void test_encodings_convert_ascii_to_utf8_auto(void) +{ +#define TEST_ASCII(success, str, forced_enc) \ + g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \ + forced_enc, str, G_N_ELEMENTS(str) - 1, forced_enc, FALSE, \ + strlen(str) != G_N_ELEMENTS(str) - 1)) + + TEST_ASCII(TRUE, "This is a very basic ASCII test", NULL); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "None"); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "ASCII"); + TEST_ASCII(TRUE, "This is a very basic ASCII test", "UTF-8"); + TEST_ASCII(TRUE, "S\till ve\ry \b\asic", NULL); + TEST_ASCII(FALSE, "With\0some\0NULs\0", NULL); + TEST_ASCII(TRUE, "With\0some\0NULs\0", "None"); + TEST_ASCII(FALSE, "With\0some\0NULs\0", "UTF-8"); + +#undef TEST_ASCII +} + + +static void test_encodings_convert_utf8_to_utf8_auto(void) +{ +#define UTF8_BOM "\xef\xbb\xbf" +#define TEST_UTF8(success, str, forced_enc) \ + G_STMT_START { \ + gboolean has_bom = strncmp(str, UTF8_BOM, 3) == 0; \ + g_assert(success == assert_convert_to_utf8_auto(str, G_N_ELEMENTS(str) - 1, G_N_ELEMENTS(str) - 1, \ + forced_enc, str + (has_bom ? 3 : 0), G_N_ELEMENTS(str) - 1 - (has_bom ? 3 : 0), \ + forced_enc, has_bom, strlen(str) != G_N_ELEMENTS(str) - 1)); \ + } G_STMT_END + + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", NULL); + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "None"); + TEST_UTF8(TRUE, "Thĩs îs å véry basìč ÅSÇǏÍ test", "UTF-8"); + TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", NULL); + TEST_UTF8(FALSE, "Wíťh\0søme\0NÙLs\0", "UTF-8"); /* the NUL doesn't pass the UTF-8 check */ + TEST_UTF8(TRUE, "Wíťh\0søme\0NÙLs\0", "None"); /* with None we do no data validation, but report partial output */ + + /* with the inline hint */ + TEST_UTF8(TRUE, "coding:utf-8 bãśïč", NULL); + TEST_UTF8(FALSE, "coding:utf-8 Wíťh\0søme\0NÙLs", NULL); + + TEST_UTF8(TRUE, UTF8_BOM"With BOM", NULL); + /* These won't pass the UTF-8 validation despite the BOM, so we fallback to + * testing other options, and it will succeed with UTF-16 so there's no real + * point in verifying this */ + /*TEST_UTF8(FALSE, UTF8_BOM"With BOM\0and NULs", NULL);*/ + /*TEST_UTF8(FALSE, UTF8_BOM"Wíth BØM\0añd NÙLs", NULL);*/ + + /* non-UTF-8 */ + TEST_UTF8(FALSE, "Th\xec""s", "UTF-8"); + TEST_UTF8(FALSE, "Th\xec""s\0", "UTF-8"); + TEST_UTF8(FALSE, "\0Th\xec""s", "UTF-8"); + +#undef TEST_UTF8 +#undef UTF8_BOM +} + + +static void test_encodings_convert_utf_other_to_utf8_auto(void) +{ +#define UTF16_LE_BOM "\xff\xfe" +#define UTF16_BE_BOM "\xfe\xff" +#define UTF32_LE_BOM "\xff\xfe\x00\x00" +#define UTF32_BE_BOM "\x00\x00\xfe\xff" +#define TEST_ENC(success, input, output, has_bom, forced_enc, expected_encoding) \ + g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \ + forced_enc, output, G_N_ELEMENTS(output) - 1, expected_encoding, has_bom, \ + strlen(output) != G_N_ELEMENTS(output) - 1)) +#define TEST(success, input, output, has_bom, forced_enc) \ + TEST_ENC(success, input, output, has_bom, forced_enc, forced_enc) + + TEST(TRUE, "N\000o\000 \000B\000O\000M\000", "No BOM", FALSE, NULL); + TEST(TRUE, "N\000o\000 \000B\000\330\000M\000", "No BØM", FALSE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, "N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "No BOM\0and NULs", FALSE, NULL); + TEST(FALSE, "N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "No BØM\0añd NÙLs", FALSE, NULL); + + TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000", "With BOM", TRUE, NULL); + TEST(TRUE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000\330\000M\000", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF16_LE_BOM"W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s\000", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF16_LE_BOM"W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s\000", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + /* We should actually be smarter in our selection of encoding introducing + * probability scores, because this loads as UTF-16LE but is "圀椀琀栀 䈀伀䴀" + * which doesn't seem to be real Chinese */ + TEST(TRUE, "\000N\000o\000 \000B\000O\000M", "No BOM", FALSE, "UTF-16BE"); + TEST(TRUE, "\000N\000o\000 \000B\000\330\000M", "No BØM", FALSE, NULL); + /* doesn't accept the NULs -- and see above for the encoding choice */ + TEST(FALSE, "\000N\000o\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "No BOM\0and NULs", FALSE, "UTF-16BE"); + TEST(FALSE, "\000N\000o\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "No BØM\0añd NÙLs", FALSE, NULL); + + TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M", "With BOM", TRUE, NULL); + TEST(TRUE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000\330\000M", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF16_BE_BOM"\000W\000i\000t\000h\000 \000B\000O\000M\000\000\000a\000n\000d\000 \000N\000U\000L\000s", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF16_BE_BOM"\000W\000\355\000t\000h\000 \000B\000\330\000M\000\000\000a\000\361\000d\000 \000N\000\331\000L\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000", "With BOM", TRUE, NULL); + TEST(TRUE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF32_LE_BOM"W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s\000\000\000", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF32_LE_BOM"W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s\000\000\000", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M", "With BOM", TRUE, NULL); + TEST(TRUE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M", "With BØM", TRUE, NULL); + /* doesn't accept the NULs */ + TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000i\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000O\000\000\000M\000\000\000\000\000\000\000a\000\000\000n\000\000\000d\000\000\000 \000\000\000N\000\000\000U\000\000\000L\000\000\000s", "With BOM\0and NULs", TRUE, NULL); + TEST(FALSE, UTF32_BE_BOM"\000\000\000W\000\000\000\355\000\000\000t\000\000\000h\000\000\000 \000\000\000B\000\000\000\330\000\000\000M\000\000\000\000\000\000\000a\000\000\000\361\000\000\000d\000\000\000 \000\000\000N\000\000\000\331\000\000\000L\000\000\000s", "Wíth BØM\0añd NÙLs", TRUE, NULL); + + /* meh, UTF-7 */ + TEST(TRUE, "No B+ANg-M", "No BØM", FALSE, "UTF-7"); + TEST(TRUE, "+/v8-With B+ANg-M", "With BØM", TRUE, NULL); + TEST(FALSE, "No B+ANg-M+AAA-but NULs", "No BØM\0but NULs", FALSE, "UTF-7"); + /* Fails to load as UTF-7 because of the NUL, and succeeds as UTF-8 but + * obviously doesn't match expectations */ + /*TEST(FALSE, "+/v8-With B+ANg-M+AAA-and NULs", "With BØM\0and NULs", TRUE, NULL);*/ + + /* empty data with BOMs */ + TEST_ENC(TRUE, "+/v8-", "", TRUE, NULL, "UTF-7"); /* UTF-7 */ + TEST_ENC(TRUE, UTF16_BE_BOM, "", TRUE, NULL, "UTF-16BE"); + TEST_ENC(TRUE, UTF16_LE_BOM, "", TRUE, NULL, "UTF-16LE"); + TEST_ENC(TRUE, UTF32_BE_BOM, "", TRUE, NULL, "UTF-32BE"); + TEST_ENC(TRUE, UTF32_LE_BOM, "", TRUE, NULL, "UTF-32LE"); + +#undef TEST +#undef TEST_ENC +#undef UTF32_BE_BOM +#undef UTF32_LE_BOM +#undef UTF16_BE_BOM +#undef UTF16_LE_BOM +} + + +static void test_encodings_convert_iso8859_to_utf8_auto(void) +{ +#define TEST(success, input, output, forced_enc) \ + g_assert(success == assert_convert_to_utf8_auto(input, G_N_ELEMENTS(input) - 1, G_N_ELEMENTS(input) - 1, \ + forced_enc, output, G_N_ELEMENTS(output) - 1, forced_enc, FALSE, \ + strlen(output) != G_N_ELEMENTS(output) - 1)) + + TEST(TRUE, "Th\xec""s", "Thìs", NULL); + TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-1"); + TEST(TRUE, "Th\xec""s", "Thìs", "ISO-8859-15"); + TEST(TRUE, "\xa4""uro", "¤uro", "ISO-8859-1"); + TEST(TRUE, "\xa4""uro", "€uro", "ISO-8859-15"); + TEST(TRUE, "\xd8""ed", "Řed", "ISO-8859-2"); + /* make-believe UTF-8 BOM followed by non-UTF-8 data */ + TEST(TRUE, "\xef\xbb\xbf""not B\xd3M", "not BÓM", NULL); + TEST(TRUE, "coding:iso-8859-2 \xd8""ed", "coding:iso-8859-2 Řed", NULL); + /* with NULs */ + TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-1"); + TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", "ISO-8859-15"); + /* This parses as UTF-16, but that's not really what we'd expect */ + /*TEST(FALSE, "W\xec""th\0z\xe9""r\xf8""s", "Wìth\0zérøs", NULL);*/ + + /* UTF-8 BOM with non-UTF-8 data, we should fallback */ + TEST(TRUE, "\xef\xbb\xbfW\xec""th\xf8""ut BOM", "Wìthøut BOM", NULL); + +#undef TEST +} + + +int main(int argc, char **argv) +{ + g_test_init(&argc, &argv, NULL); + gtk_init_check(&argc, &argv); + main_init_headless(); + + g_test_add_func("/encodings/ascii/convert_to_utf8_auto", test_encodings_convert_ascii_to_utf8_auto); + g_test_add_func("/encodings/utf8/convert_to_utf8_auto", test_encodings_convert_utf8_to_utf8_auto); + g_test_add_func("/encodings/utf_other/convert_to_utf_other_auto", test_encodings_convert_utf_other_to_utf8_auto); + g_test_add_func("/encodings/iso8859/convert_to_utf8_auto", test_encodings_convert_iso8859_to_utf8_auto); + + return g_test_run(); +} diff --git a/tests/test_sidebar.c b/tests/test_sidebar.c index 9659dc72e7..a1a470497e 100644 --- a/tests/test_sidebar.c +++ b/tests/test_sidebar.c @@ -52,7 +52,7 @@ static gboolean tree_strings_cb(GtkTreeModel *model, GtkTreePath *path, return FALSE; } -void do_test_sidebar_openfiles(const gchar **test_data, const gchar **expected) +static void do_test_sidebar_openfiles(const gchar **test_data, const gchar **expected) { #ifdef HAVE_G_STRV_EQUAL int count = 0; @@ -72,7 +72,7 @@ void do_test_sidebar_openfiles(const gchar **test_data, const gchar **expected) #endif } -void test_sidebar_openfiles_none(void) +static void test_sidebar_openfiles_none(void) { const gchar *files[] = { "/tmp/x", @@ -92,7 +92,7 @@ void test_sidebar_openfiles_none(void) } -void test_sidebar_openfiles_path(void) +static void test_sidebar_openfiles_path(void) { const gchar *files[] = { "/tmp/x", @@ -114,7 +114,7 @@ void test_sidebar_openfiles_path(void) } -void test_sidebar_openfiles_tree(void) +static void test_sidebar_openfiles_tree(void) { const gchar *files[] = { "/tmp/x", diff --git a/tests/test_utils.c b/tests/test_utils.c index ea9a307f15..149801be21 100644 --- a/tests/test_utils.c +++ b/tests/test_utils.c @@ -107,7 +107,7 @@ static void test_utils_strv_find_common_prefix(void) } #define DIR_SEP "\\/" -void test_utils_strv_find_lcs(void) +static void test_utils_strv_find_lcs(void) { gchar **data, *s; @@ -291,7 +291,7 @@ static gboolean strv_eq(gchar **strv1, gchar **strv2) } } -void test_utils_strv_shorten_file_list(void) +static void test_utils_strv_shorten_file_list(void) { gchar **data, **expected, **result; gchar *empty[] = { NULL }; @@ -408,6 +408,28 @@ void test_utils_strv_shorten_file_list(void) g_strfreev(data); } +static void test_utils_get_initials(void) +{ +#define CHECK_INITIALS(buf, initials) \ + G_STMT_START { \ + gchar *r = utils_get_initials(buf); \ + g_assert_cmpstr(r, ==, initials); \ + g_free(r); \ + } G_STMT_END + + CHECK_INITIALS("John Doe", "JD"); + CHECK_INITIALS(" John Doe ", "JD"); + CHECK_INITIALS("John", "J"); + CHECK_INITIALS("John F. Doe", "JFD"); + CHECK_INITIALS("Gary Errol Anthony Nicholas Yales", "GEANY"); + CHECK_INITIALS("", ""); + CHECK_INITIALS("Åsa Åkesson", "ÅÅ"); /* composed */ + CHECK_INITIALS("Åsa Åkesson", "ÅÅ"); /* decomposed */ + CHECK_INITIALS("Œdipe", "Œ"); + +#undef CHECK_INITIALS +} + int main(int argc, char **argv) { g_test_init(&argc, &argv, NULL); @@ -416,6 +438,7 @@ int main(int argc, char **argv) UTIL_TEST_ADD("strv_find_common_prefix", test_utils_strv_find_common_prefix); UTIL_TEST_ADD("strv_find_lcs", test_utils_strv_find_lcs); UTIL_TEST_ADD("strv_shorten_file_list", test_utils_strv_shorten_file_list); + UTIL_TEST_ADD("get_initals", test_utils_get_initials); return g_test_run(); }