From 54fb59b7141a0511761a56ac7dc1c720b5610b6a Mon Sep 17 00:00:00 2001 From: iphydf Date: Sun, 20 Mar 2022 22:39:43 +0000 Subject: [PATCH] perf: Use ByteString for lexer; decode to Text immediately. This makes the end-to-end parsing about 2x faster. We can now parse all of toxcore in 0.45 seconds on my machine, making it parse around 3.4MiB/s (including TreeParser and CommentParser). The lexer consumes 0.11s out of those 0.45s, running at around 9.8MiB/s. --- BUILD.bazel | 5 ++++- cimple.cabal | 2 +- src/Language/Cimple/IO.hs | 30 ++++++++++++-------------- src/Language/Cimple/Lexer.x | 25 +++++++++++----------- src/Language/Cimple/Parser.y | 39 ++++++++++++++++++---------------- tools/BUILD.bazel | 14 +++++++++++- tools/count-tokens.hs | 41 ++++++++++++++++++++++++++++++++++++ tools/dump-tokens.hs | 20 ++++++++---------- 8 files changed, 116 insertions(+), 60 deletions(-) create mode 100644 tools/count-tokens.hs diff --git a/BUILD.bazel b/BUILD.bazel index 51dee09..79476a0 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -25,6 +25,8 @@ haskell_library( hazel_library("aeson"), hazel_library("array"), hazel_library("base"), + hazel_library("bytestring"), + hazel_library("text"), ], ) @@ -77,6 +79,7 @@ haskell_library( hazel_library("base"), hazel_library("data-fix"), hazel_library("recursion-schemes"), + hazel_library("text"), hazel_library("transformers-compat"), ], ) @@ -139,7 +142,7 @@ haskell_library( ], ), src_strip_prefix = "src", - version = "0.0.16", + version = "0.0.17", visibility = ["//visibility:public"], deps = [ ":ast", diff --git a/cimple.cabal b/cimple.cabal index 390476d..f191ba3 100644 --- a/cimple.cabal +++ b/cimple.cabal @@ -1,5 +1,5 @@ name: cimple -version: 0.0.16 +version: 0.0.17 synopsis: Simple C-like programming language homepage: https://toktok.github.io/ license: GPL-3 diff --git a/src/Language/Cimple/IO.hs b/src/Language/Cimple/IO.hs index daea8f7..07fc664 100644 --- a/src/Language/Cimple/IO.hs +++ b/src/Language/Cimple/IO.hs @@ -10,11 
+10,10 @@ module Language.Cimple.IO import Control.Monad ((>=>)) import qualified Control.Monad.Parallel as P import Control.Monad.State.Strict (State, evalState, get, put) -import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS import Data.Map.Strict (Map) import qualified Data.Map.Strict as Map import Data.Text (Text) -import qualified Data.Text as Text import qualified Data.Text.Encoding as Text import Language.Cimple.Ast (Node) import Language.Cimple.Lexer (Lexeme, runAlex) @@ -27,37 +26,36 @@ import qualified Language.Cimple.Program as Program import Language.Cimple.TranslationUnit (TranslationUnit) import qualified Language.Cimple.TreeParser as TreeParser -type StringNode = Node (Lexeme String) type TextNode = Node (Lexeme Text) -toTextAst :: [StringNode] -> [TextNode] -toTextAst stringAst = - evalState (mapAst cacheActions stringAst) Map.empty +cacheText :: [TextNode] -> [TextNode] +cacheText textAst = + evalState (mapAst cacheActions textAst) Map.empty where - cacheActions :: TextActions (State (Map String Text)) String Text + cacheActions :: TextActions (State (Map Text Text)) Text Text cacheActions = textActions $ \s -> do m <- get case Map.lookup s m of Nothing -> do - let text = Text.pack s - put $ Map.insert s text m - return text + put $ Map.insert s s m + return s Just text -> return text parseText :: Text -> Either String [TextNode] -parseText contents = - toTextAst <$> runAlex (Text.unpack contents) Parser.parseTranslationUnit +parseText = fmap cacheText . flip runAlex Parser.parseTranslationUnit . LBS.fromStrict . Text.encodeUtf8 + +parseBytes :: LBS.ByteString -> Either String [TextNode] +parseBytes = flip runAlex Parser.parseTranslationUnit -parseTextPedantic :: Text -> Either String [TextNode] -parseTextPedantic = - parseText >=> ParseResult.toEither . TreeParser.parseTranslationUnit +parseBytesPedantic :: LBS.ByteString -> Either String [TextNode] +parseBytesPedantic = parseBytes >=> ParseResult.toEither . 
TreeParser.parseTranslationUnit parseFile :: FilePath -> IO (Either String (TranslationUnit Text)) parseFile source = - addSource . parseTextPedantic . Text.decodeUtf8 <$> BS.readFile source + addSource . parseBytesPedantic <$> LBS.readFile source where -- Add source filename to the error message, if any. addSource (Left err) = Left $ source <> err diff --git a/src/Language/Cimple/Lexer.x b/src/Language/Cimple/Lexer.x index 825a149..f496907 100644 --- a/src/Language/Cimple/Lexer.x +++ b/src/Language/Cimple/Lexer.x @@ -4,6 +4,7 @@ {-# LANGUAGE DeriveTraversable #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE StrictData #-} +{-# OPTIONS_GHC -Wno-unused-imports #-} module Language.Cimple.Lexer ( Alex , AlexPosn (..) @@ -15,16 +16,20 @@ module Language.Cimple.Lexer , lexemePosn , lexemeText , lexemeLine - , mkL , runAlex ) where import Data.Aeson (FromJSON, ToJSON) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as LBS +import Data.Text (Text) +import qualified Data.Text as Text +import qualified Data.Text.Encoding as Text import GHC.Generics (Generic) import Language.Cimple.Tokens (LexemeClass (..)) } -%wrapper "monad" +%wrapper "monad-bytestring" tokens :- @@ -302,8 +307,9 @@ data Lexeme text = L AlexPosn LexemeClass text instance FromJSON text => FromJSON (Lexeme text) instance ToJSON text => ToJSON (Lexeme text) -mkL :: Applicative m => LexemeClass -> AlexInput -> Int -> m (Lexeme String) -mkL c (p, _, _, str) len = pure $ L p c (take len str) +mkL :: LexemeClass -> AlexInput -> Int64 -> Alex (Lexeme Text) +mkL c (p, _, str, _) len = pure $ L p c (piece str) + where piece = Text.decodeUtf8 . LBS.toStrict . 
LBS.take len lexemePosn :: Lexeme text -> AlexPosn lexemePosn (L p _ _) = p @@ -317,15 +323,10 @@ lexemeText (L _ _ s) = s lexemeLine :: Lexeme text -> Int lexemeLine (L (AlexPn _ l _) _ _) = l -start :: Int -> AlexInput -> Int -> Alex (Lexeme String) -start code _ _ = do - alexSetStartCode code - alexMonadScan +alexEOF :: Alex (Lexeme Text) +alexEOF = return (L (AlexPn 0 0 0) Eof Text.empty) -alexEOF :: Alex (Lexeme String) -alexEOF = return (L (AlexPn 0 0 0) Eof "") - -alexScanTokens :: String -> Either String [Lexeme String] +alexScanTokens :: LBS.ByteString -> Either String [Lexeme Text] alexScanTokens str = runAlex str $ loop [] where diff --git a/src/Language/Cimple/Parser.y b/src/Language/Cimple/Parser.y index 73e1b5c..d8f2769 100644 --- a/src/Language/Cimple/Parser.y +++ b/src/Language/Cimple/Parser.y @@ -1,9 +1,12 @@ { +{-# LANGUAGE OverloadedStrings #-} module Language.Cimple.Parser ( parseTranslationUnit ) where import Data.Fix (Fix (..)) +import Data.Text (Text) +import qualified Data.Text as Text import Language.Cimple.Ast (AssignOp (..), BinaryOp (..), CommentStyle (..), LiteralType (..), Node, NodeF (..), Scope (..), @@ -22,7 +25,7 @@ import Language.Cimple.Tokens (LexemeClass (..)) %errorhandlertype explist %lexer {lexwrap} {L _ Eof _} %monad {Alex} -%tokentype {StringLexeme} +%tokentype {Term} %token ID_CONST { L _ IdConst _ } ID_FUNC_TYPE { L _ IdFuncType _ } @@ -173,12 +176,12 @@ CopyrightDecl :: { NonTerm } CopyrightDecl : ' ' 'Copyright' CopyrightDates CopyrightOwner '\n' { Fix $ CopyrightDecl (fst $3) (snd $3) $4 } -CopyrightDates :: { (StringLexeme, Maybe StringLexeme) } +CopyrightDates :: { (Term, Maybe Term) } CopyrightDates : LIT_INTEGER { ($1, Nothing) } | LIT_INTEGER '-' LIT_INTEGER { ($1, Just $3) } -CopyrightOwner :: { [StringLexeme] } +CopyrightOwner :: { [Term] } CopyrightOwner : CMT_WORD CommentWords { $1 : reverse $2 } @@ -216,24 +219,24 @@ Comment | '/** @} */' { Fix $ CommentSectionEnd $1 } | Ignore { $1 } -CommentTokens :: { 
[StringLexeme] } +CommentTokens :: { [Term] } CommentTokens : CommentToken { [$1] } | CommentTokens CommentToken { $2 : $1 } -CommentToken :: { StringLexeme } +CommentToken :: { Term } CommentToken : CommentWord { $1 } | '\n' { $1 } | ' * ' { $1 } | ' ' { $1 } -CommentWords :: { [StringLexeme] } +CommentWords :: { [Term] } CommentWords : { [] } | CommentWords CommentWord { $2 : $1 } -CommentWord :: { StringLexeme } +CommentWord :: { Term } CommentWord : CMT_WORD { $1 } | CMT_COMMAND { $1 } @@ -265,7 +268,7 @@ Ignore :: { NonTerm } Ignore : IGN_START IgnoreBody IGN_END { Fix $ Comment Ignore $1 (reverse $2) $3 } -IgnoreBody :: { [StringLexeme] } +IgnoreBody :: { [Term] } IgnoreBody : IGN_BODY { [$1] } | IgnoreBody IGN_BODY { $2 : $1 } @@ -602,7 +605,7 @@ Enumerator | EnumeratorName '=' ConstExpr ',' { Fix $ Enumerator $1 (Just $3) } | Comment { $1 } -EnumeratorName :: { StringLexeme } +EnumeratorName :: { Term } EnumeratorName : ID_CONST { $1 } | ID_SUE_TYPE { $1 } @@ -683,11 +686,11 @@ NonNull | nullable '(' Ints ')' { Fix . NonNull [] $3 } | non_null '(' Ints ')' nullable '(' Ints ')' { Fix . NonNull $3 $7 } -Ints :: { [StringLexeme] } +Ints :: { [Term] } Ints : IntList { reverse $1 } -IntList :: { [StringLexeme] } +IntList :: { [Term] } IntList : LIT_INTEGER { [$1] } | IntList ',' LIT_INTEGER { $3 : $1 } @@ -729,8 +732,8 @@ ConstDecl | static const LeafType ID_VAR '=' InitialiserExpr ';' { Fix $ ConstDefn Static $3 $4 $6 } { -type StringLexeme = Lexeme String -type NonTerm = Node StringLexeme +type Term = Lexeme Text +type NonTerm = Node Term tyPointer, tyConst :: NonTerm -> NonTerm tyPointer = Fix . 
TyPointer @@ -741,14 +744,14 @@ parseError (L (AlexPn _ line col) c t, options) = alexError $ ":" <> show line <> ":" <> show col <> ": Parse error near " <> show c <> ": " <> show t <> "; expected one of " <> show options -lexwrap :: (Lexeme String -> Alex a) -> Alex a +lexwrap :: (Lexeme Text -> Alex a) -> Alex a lexwrap = (alexMonadScan >>=) externC - :: Lexeme String - -> Lexeme String + :: Term + -> Term -> [NonTerm] - -> Lexeme String + -> Term -> Alex NonTerm externC (L _ _ "__cplusplus") (L _ _ "\"C\"") decls (L _ _ "__cplusplus") = return $ Fix $ ExternC decls @@ -758,7 +761,7 @@ externC _ lang _ _ = macroBodyStmt :: NonTerm - -> Lexeme String + -> Term -> Alex NonTerm macroBodyStmt decls (L _ _ "0") = return $ Fix $ MacroBodyStmt decls diff --git a/tools/BUILD.bazel b/tools/BUILD.bazel index 138a919..d17e1b1 100644 --- a/tools/BUILD.bazel +++ b/tools/BUILD.bazel @@ -25,6 +25,19 @@ haskell_binary( ], ) +haskell_binary( + name = "count-tokens", + srcs = ["count-tokens.hs"], + visibility = ["//visibility:public"], + deps = [ + "//hs-cimple", + hazel_library("base"), + hazel_library("bytestring"), + hazel_library("text"), + hazel_library("time"), + ], +) + haskell_binary( name = "dump-tokens", srcs = ["dump-tokens.hs"], @@ -34,7 +47,6 @@ haskell_binary( hazel_library("base"), hazel_library("bytestring"), hazel_library("groom"), - hazel_library("text"), ], ) diff --git a/tools/count-tokens.hs b/tools/count-tokens.hs new file mode 100644 index 0000000..9f3f3d4 --- /dev/null +++ b/tools/count-tokens.hs @@ -0,0 +1,41 @@ +{-# LANGUAGE Strict #-} +module Main (main) where + +import qualified Data.ByteString.Lazy as LBS +import qualified Data.Text as Text +import Data.Time.Clock (diffUTCTime, getCurrentTime) +import Language.Cimple (Alex, Lexeme (..), LexemeClass (..), + alexMonadScan, runAlex) +import System.Environment (getArgs) + + +countTokens :: LBS.ByteString -> Either String (Int, Int) +countTokens str = runAlex str $ loop 0 0 + where + loop :: Int -> Int -> 
Alex (Int, Int) + loop toks len = do + (L _ c t) <- alexMonadScan + if c == Eof + then return (toks, len) + else loop (toks + 1) (len + Text.length t) + +processFile :: FilePath -> IO (Int, Int) +processFile source = do + contents <- LBS.readFile source + case countTokens contents of + Left err -> fail err + Right ok -> return ok + +processFiles :: [FilePath] -> IO (Int, Int) +processFiles = fmap ((\(a, b) -> (sum a, sum b)) . unzip) . mapM processFile + +main :: IO () +main = do + sources <- getArgs + start <- getCurrentTime + (toks, len) <- processFiles sources + end <- getCurrentTime + putStrLn $ "Tokenised " <> show (length sources) <> " sources into " + <> show toks <> " lexemes (" <> show len <> " bytes) in " + <> show (diffUTCTime end start) <> " (" + <> show (fromIntegral len / 1024 / 1024 / diffUTCTime end start) <> " MiB/s)" diff --git a/tools/dump-tokens.hs b/tools/dump-tokens.hs index cd90a92..5fc829a 100644 --- a/tools/dump-tokens.hs +++ b/tools/dump-tokens.hs @@ -1,23 +1,21 @@ module Main (main) where -import qualified Data.ByteString as BS -import qualified Data.Text as Text -import qualified Data.Text.Encoding as Text -import Language.Cimple (alexScanTokens) -import System.Environment (getArgs) -import Text.Groom (groom) +import qualified Data.ByteString.Lazy as LBS +import Language.Cimple (alexScanTokens) +import System.Environment (getArgs) +import Text.Groom (groom) processFile :: FilePath -> IO () processFile source = do putStrLn $ "Processing " ++ source - contents <- Text.unpack . Text.decodeUtf8 <$> BS.readFile source + contents <- LBS.readFile source case alexScanTokens contents of Left err -> fail err Right ok -> putStrLn $ groom ok main :: IO () main = do - args <- getArgs - case args of - [src] -> processFile src - _ -> fail "Usage: dump-tokens <file.c>" + args <- getArgs + case args of + [src] -> processFile src + _ -> fail "Usage: dump-tokens <file.c>"