feat: Add doxygen comment parser.

This one is very strict and most comments in toxcore don't comply and need to be fixed.
TokTok · Mar 5, 2022 · a55da30 · a55da30
1 parent e404717
commit a55da30
Show file tree

Hide file tree

Showing 15 changed files with 607 additions and 71 deletions.
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -16,6 +16,7 @@ haskell_library(
     name = "lexer",
     srcs = [
         "src/Language/Cimple/Lexer.hs",
+        "src/Language/Cimple/ParseResult.hs",
         "src/Language/Cimple/Tokens.hs",
     ],
     src_strip_prefix = "src",
@@ -33,7 +34,6 @@ haskell_library(
     src_strip_prefix = "src",
     visibility = ["//hs-cimple:__subpackages__"],
     deps = [
-        ":lexer",
         hazel_library("aeson"),
         hazel_library("base"),
         hazel_library("data-fix"),
@@ -81,6 +81,28 @@ haskell_library(
     ],
 )
 
+happy_parser(
+    name = "CommentParser",
+    src = "src/Language/Cimple/CommentParser.y",
+    preproc = "expand_yacc.pl",
+)
+
+haskell_library(
+    name = "comment-parser",
+    srcs = [":CommentParser"],
+    src_strip_prefix = "src",
+    visibility = ["//hs-cimple:__subpackages__"],
+    deps = [
+        ":ast",
+        ":describe-ast",
+        ":lexer",
+        hazel_library("array"),
+        hazel_library("base"),
+        hazel_library("data-fix"),
+        hazel_library("text"),
+    ],
+)
+
 happy_parser(
     name = "TreeParser",
     src = "src/Language/Cimple/TreeParser.y",
@@ -94,6 +116,7 @@ haskell_library(
     visibility = ["//hs-cimple:__subpackages__"],
     deps = [
         ":ast",
+        ":comment-parser",
         ":describe-ast",
         ":lexer",
         hazel_library("array"),
@@ -111,6 +134,7 @@ haskell_library(
             "src/Language/Cimple/Ast.hs",
             "src/Language/Cimple/DescribeAst.hs",
             "src/Language/Cimple/Flatten.hs",
+            "src/Language/Cimple/ParseResult.hs",
             "src/Language/Cimple/Tokens.hs",
         ],
     ),

diff --git a/cimple.cabal b/cimple.cabal
@@ -36,10 +36,12 @@ library
   other-modules:
       Language.Cimple.Annot
     , Language.Cimple.Ast
+    , Language.Cimple.CommentParser
     , Language.Cimple.DescribeAst
     , Language.Cimple.Flatten
     , Language.Cimple.Graph
     , Language.Cimple.Lexer
+    , Language.Cimple.ParseResult
     , Language.Cimple.Parser
     , Language.Cimple.SemCheck.Includes
     , Language.Cimple.Tokens

diff --git a/src/Language/Cimple/Ast.hs b/src/Language/Cimple/Ast.hs
@@ -13,6 +13,8 @@ module Language.Cimple.Ast
     , Node, NodeF (..)
     , Scope (..)
     , CommentStyle (..)
+    , Comment
+    , CommentF (..)
     ) where
 
 import           Data.Aeson                   (FromJSON, FromJSON1, ToJSON,
@@ -47,6 +49,7 @@ data NodeF lexeme a
     | CommentSection a [a] a
     | CommentSectionEnd lexeme
     | Commented a a
+    | CommentInfo (Comment lexeme)
     -- Namespace-like blocks
     | ExternC [a]
     -- An inferred coherent block of nodes, printed without empty lines
@@ -126,6 +129,40 @@ type Node lexeme = Fix (NodeF lexeme)
 instance FromJSON lexeme => FromJSON1 (NodeF lexeme)
 instance ToJSON lexeme => ToJSON1 (NodeF lexeme)
 
+data CommentF lexeme a
+    = DocComment [a]
+    | DocWord lexeme
+    | DocSentence [a] lexeme
+    | DocNewline
+
+    | DocBrief [a]
+    | DocDeprecated [a]
+    | DocParam (Maybe lexeme) lexeme [a]
+    | DocReturn [a]
+    | DocRetval [lexeme] [a]
+    | DocSee lexeme [a]
+
+    | DocLine [a]
+    | DocBullet [a] [a]
+    | DocBulletList [a]
+
+    | DocColon lexeme
+    | DocRef lexeme
+    | DocP lexeme
+    | DocLParen a
+    | DocRParen a
+    | DocAssignOp AssignOp lexeme lexeme
+    | DocBinaryOp BinaryOp lexeme lexeme
+    | DocMinus lexeme a
+    | DocSlash lexeme a
+    deriving (Show, Read, Eq, Ord, Generic, Generic1, Functor, Foldable, Traversable)
+    deriving (Show1, Read1, Eq1, Ord1) via FunctorClassesDefault (CommentF lexeme)
+
+type Comment lexeme = Fix (CommentF lexeme)
+
+instance FromJSON lexeme => FromJSON1 (CommentF lexeme)
+instance ToJSON lexeme => ToJSON1 (CommentF lexeme)
+
 data AssignOp
     = AopEq
     | AopMul

diff --git a/src/Language/Cimple/CommentParser.y b/src/Language/Cimple/CommentParser.y
@@ -0,0 +1,201 @@
+{
+{-# LANGUAGE FlexibleInstances #-}
+{-# LANGUAGE TypeSynonymInstances #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Language.Cimple.CommentParser
+    ( parseComment
+    ) where
+
+import           Data.Fix                    (Fix (..))
+import           Data.Text                   (Text)
+import qualified Data.Text                   as Text
+
+import           Language.Cimple.Ast         (AssignOp (..), BinaryOp (..),
+                                              Comment, CommentF (..), Node,
+                                              NodeF (..))
+import           Language.Cimple.DescribeAst (describeLexeme, sloc)
+import           Language.Cimple.Lexer       (Lexeme (..))
+import           Language.Cimple.ParseResult (ParseResult)
+import           Language.Cimple.Tokens      (LexemeClass (..))
+}
+
+%name parseComment Comment
+
+%error {parseError}
+%errorhandlertype explist
+%monad {ParseResult}
+%tokentype {Term}
+%token
+    '@brief'			{ L _ CmtCommand "@brief"	}
+    '@deprecated'		{ L _ CmtCommand "@deprecated"	}
+    '@implements'		{ L _ CmtCommand "@implements"	}
+    '@param'			{ L _ CmtCommand "@param"	}
+    '@ref'			{ L _ CmtCommand "@ref"		}
+    '@p'			{ L _ CmtCommand "@p"		}
+    '@return'			{ L _ CmtCommand "@return"	}
+    '@retval'			{ L _ CmtCommand "@retval"	}
+    '@see'			{ L _ CmtCommand "@see"		}
+
+    ' '				{ L _ CmtIndent		" "	}
+    'INDENT1'			{ L _ CmtIndent		"   "	}
+    'INDENT2'			{ L _ CmtIndent		"     "	}
+
+    '('				{ L _ PctLParen			_ }
+    ')'				{ L _ PctRParen			_ }
+    ','				{ L _ PctComma			_ }
+    ':'				{ L _ PctColon			_ }
+    '/'				{ L _ PctSlash			_ }
+    '='				{ L _ PctEq			_ }
+    '=='			{ L _ PctEqEq			_ }
+    '!='			{ L _ PctEMarkEq		_ }
+    '>='			{ L _ PctGreaterEq		_ }
+    ';'				{ L _ PctSemicolon		_ }
+    '.'				{ L _ PctPeriod			_ }
+    '...'			{ L _ PctEllipsis		_ }
+    '?'				{ L _ PctQMark			_ }
+    '!'				{ L _ PctEMark			_ }
+    '-'				{ L _ PctMinus			_ }
+    '\n'			{ L _ PpNewline			_ }
+    '/**'			{ L _ CmtStartDoc		_ }
+    '*/'			{ L _ CmtEnd			_ }
+    LIT_INTEGER			{ L _ LitInteger		_ }
+    LIT_STRING			{ L _ LitString			_ }
+    CMT_ATTR			{ L _ CmtAttr			_ }
+    CMT_CODE			{ L _ CmtCode			_ }
+    CMT_WORD			{ L _ CmtWord			_ }
+    CMT_REF			{ L _ CmtRef			_ }
+
+%left '.' '?' ',' ';'
+%left '-'
+%left '/'
+%left '(' ')'
+%right NEG
+
+%%
+
+Comment :: { NonTerm }
+Comment
+:	'/**' '\n' Blocks '*/'					{ Fix $ DocComment $3 }
+|	'/**' WordsWithoutNewlines '*/'				{ Fix $ DocComment [Fix $ DocLine $2] }
+|	'/**' Command(WordsWithoutNewlines) '*/'		{ Fix $ DocComment [$2] }
+|	'/**' Command(WordsWithoutNewlines) '\n' Blocks '*/'	{ Fix $ DocComment ($2 : $4) }
+
+Blocks :: { [NonTerm] }
+Blocks
+:	BlockList						{ reverse $1 }
+
+BlockList :: { [NonTerm] }
+BlockList
+:	Block							{ [$1] }
+|	BlockList Block						{ $2 : $1 }
+
+Block :: { NonTerm }
+Block
+:	'\n'							{ Fix DocNewline }
+|	' ' Command(IndentedSentence)				{ $2 }
+|	' ' WordsWithoutNewlines '\n'				{ Fix $ DocLine $2 }
+|	BulletListI						{ Fix $ DocBulletList (reverse $1) }
+
+Command(x)
+:	'@brief' x						{ Fix $ DocBrief $2 }
+|	'@param' CMT_WORD x					{ Fix $ DocParam Nothing $2 $3 }
+|	'@param' CMT_ATTR CMT_WORD x				{ Fix $ DocParam (Just $2) $3 $4 }
+|	'@retval' ConstExpr x					{ Fix $ DocRetval $2 $3 }
+|	'@return' x						{ Fix $ DocReturn $2 }
+|	'@return' '\n' BulletListII				{ Fix $ DocReturn (Fix (DocLine []) : $3) }
+|	'@see' CMT_WORD x					{ Fix $ DocSee $2 $3 }
+|	'@deprecated' x						{ Fix $ DocDeprecated $2 }
+
+BulletListI :: { [NonTerm] }
+BulletListI
+:	BulletI							{ [$1] }
+|	BulletListI BulletI					{ $2 : $1 }
+
+BulletI :: { NonTerm }
+BulletI
+:	' ' '-' WordsWithoutNewlines '\n' BulletListII		{ Fix $ DocBullet $3 (reverse $5) }
+
+BulletListII :: { [NonTerm] }
+BulletListII
+:								{ [] }
+|	BulletListII BulletII					{ $2 : $1 }
+
+BulletII :: { NonTerm }
+BulletII
+:	'INDENT1' '-' WordsWithoutNewlines '\n' BulletIIConts	{ Fix $ DocBullet ($3 ++ $5) [] }
+
+BulletIIConts :: { [NonTerm] }
+BulletIIConts
+:								{ [] }
+|	BulletIIConts BulletIICont				{ $1 ++ $2 }
+
+BulletIICont :: { [NonTerm] }
+BulletIICont
+:	'INDENT2' WordsWithoutNewlines '\n'			{ $2 }
+
+IndentedSentence :: { [NonTerm] }
+IndentedSentence
+:	WordsWithoutNewlines '\n'				{ [Fix $ DocLine $1] }
+|	WordsWithoutNewlines '\n' 'INDENT1' IndentedSentence	{ Fix (DocLine $1) : $4 }
+
+WordsWithoutNewlines :: { [NonTerm] }
+WordsWithoutNewlines
+:	SentenceList(WordList)					{ $1 }
+
+WordList :: { [NonTerm] }
+WordList
+:	Word							{ [$1] }
+|	WordList Word						{ $2 : $1 }
+
+SentenceList(x)
+:	x							{ reverse $1 }
+|	SentenceList(x) ';'					{ [Fix (DocSentence $1 $2)] }
+|	SentenceList(x) ','					{ [Fix (DocSentence $1 $2)] }
+|	SentenceList(x) '.'					{ [Fix (DocSentence $1 $2)] }
+|	SentenceList(x) '?'					{ [Fix (DocSentence $1 $2)] }
+|	SentenceList(x) ';' SentenceList(x)			{ Fix (DocSentence $1 $2) : $3 }
+|	SentenceList(x) ',' SentenceList(x)			{ Fix (DocSentence $1 $2) : $3 }
+|	SentenceList(x) '.' SentenceList(x)			{ Fix (DocSentence $1 $2) : $3 }
+|	SentenceList(x) '?' SentenceList(x)			{ Fix (DocSentence $1 $2) : $3 }
+
+Word :: { NonTerm }
+Word
+:	Atom							{ Fix $ DocWord $1 }
+|	Atom ':'						{ Fix $ DocColon $1 }
+|	Atom '-' Word						{ Fix $ DocMinus $1 $3 }
+|	Atom '/' Word						{ Fix $ DocSlash $1 $3 }
+|	Atom '=' Atom						{ Fix $ DocAssignOp AopEq $1 $3 }
+|	Atom '!=' Atom						{ Fix $ DocBinaryOp BopNe $1 $3 }
+|	Atom '>=' Atom						{ Fix $ DocBinaryOp BopGe $1 $3 }
+|	Atom '==' Atom						{ Fix $ DocBinaryOp BopEq $1 $3 }
+|	'@ref' Atom						{ Fix $ DocRef $2 }
+|	'@p' Atom						{ Fix $ DocP $2 }
+|	'(' Word						{ Fix $ DocLParen $2 }
+|	Word ')'						{ Fix $ DocRParen $1 }
+
+Atom :: { Term }
+Atom
+:	CMT_WORD						{ $1 }
+|	CMT_CODE						{ $1 }
+|	LIT_INTEGER						{ $1 }
+|	LIT_STRING						{ $1 }
+|	'...'							{ $1 }
+
+ConstExpr :: { [Term] }
+ConstExpr
+:	Atom							{ [$1] }
+|	'-' Atom						{ [$1, $2] }
+
+{
+type Term = Lexeme Text
+type NonTerm = Comment Term
+
+failAt :: Lexeme Text -> String -> ParseResult a
+failAt n msg =
+    fail $ Text.unpack (sloc "" n) <> ": unexpected " <> describeLexeme n <> msg
+
+parseError :: ([Lexeme Text], [String]) -> ParseResult a
+parseError ([], options)  = fail $ " end of comment; expected one of " <> show options
+parseError (n:_, [])      = failAt n "; expected end of comment"
+parseError (n:_, options) = failAt n $ "; expected one of " <> show options
+}
diff --git a/src/Language/Cimple/DescribeAst.hs b/src/Language/Cimple/DescribeAst.hs
@@ -4,6 +4,7 @@
 {-# LANGUAGE StrictData        #-}
 module Language.Cimple.DescribeAst
     ( HasLocation (..)
+    , describeLexeme
     , describeNode
     ) where
 
@@ -37,3 +38,6 @@ describeNode node = case unFix node of
   where
     ellipsis :: String
     ellipsis = "..."
+
+describeLexeme :: Show a => Lexeme a -> String
+describeLexeme = show
diff --git a/src/Language/Cimple/Flatten.hs b/src/Language/Cimple/Flatten.hs
@@ -7,10 +7,12 @@
 {-# LANGUAGE TypeOperators         #-}
 module Language.Cimple.Flatten (lexemes) where
 
+import           Data.Fix            (Fix (..))
 import           Data.Maybe          (maybeToList)
 import           GHC.Generics
-import           Language.Cimple.Ast (AssignOp, BinaryOp, CommentStyle,
-                                      LiteralType, NodeF (..), Scope, UnaryOp)
+import           Language.Cimple.Ast (AssignOp, BinaryOp, CommentF,
+                                      CommentStyle, LiteralType, NodeF (..),
+                                      Scope, UnaryOp)
 
 class Concats t a where
     concats :: t -> [a]
@@ -53,11 +55,15 @@ instance {-# OVERLAPPING #-} GenConcatsFlatten a a where
 instance {-# OVERLAPPABLE #-} GenConcatsFlatten b a => GenConcatsFlatten [b] a where
     gconcatsFlatten = concatMap gconcatsFlatten
 
+instance GenConcatsFlatten (Fix (CommentF a)) a where
+    gconcatsFlatten = error "TODO: gconcatsFlatten for CommentF"
+
 instance GenConcatsFlatten t a => GenConcats (Rec0 t) a where
     gconcats (K1 x) = gconcatsFlatten x
 
 -- Uses the default signature, which delegates to the generic stuff
 instance Concats (NodeF a [a]) a
+instance Concats (CommentF a [a]) a
 
 lexemes :: NodeF lexeme [lexeme] -> [lexeme]
 lexemes = concats
diff --git a/src/Language/Cimple/IO.hs b/src/Language/Cimple/IO.hs
@@ -21,6 +21,7 @@ import           Language.Cimple.Lexer           (Lexeme, runAlex)
 import           Language.Cimple.MapAst          (TextActions, mapAst,
                                                   textActions)
 import qualified Language.Cimple.Parser          as Parser
+import qualified Language.Cimple.ParseResult     as ParseResult
 import           Language.Cimple.Program         (Program)
 import qualified Language.Cimple.Program         as Program
 import           Language.Cimple.TranslationUnit (TranslationUnit)
@@ -51,7 +52,7 @@ parseText contents =
 
 parseTextPedantic :: Text -> Either String [TextNode]
 parseTextPedantic =
-    parseText >=> TreeParser.toEither . TreeParser.parseTranslationUnit
+    parseText >=> ParseResult.toEither . TreeParser.parseTranslationUnit
 
 
 parseFile :: FilePath -> IO (Either String (TranslationUnit Text))