Skip to content

Commit

Permalink
perf: Use ByteString for lexer; decode to Text immediately.
Browse files Browse the repository at this point in the history
This makes the end-to-end parsing about 2x faster. We can now parse all
of toxcore in 0.45 seconds on my machine, making it parse around
3.4MiB/s (including TreeParser and CommentParser).

The lexer consumes 0.11s out of those 0.45s, running at around 9.8MiB/s.
  • Loading branch information
iphydf committed Mar 20, 2022
1 parent 38bbf56 commit 54fb59b
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 60 deletions.
5 changes: 4 additions & 1 deletion BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ haskell_library(
hazel_library("aeson"),
hazel_library("array"),
hazel_library("base"),
hazel_library("bytestring"),
hazel_library("text"),
],
)

Expand Down Expand Up @@ -77,6 +79,7 @@ haskell_library(
hazel_library("base"),
hazel_library("data-fix"),
hazel_library("recursion-schemes"),
hazel_library("text"),
hazel_library("transformers-compat"),
],
)
Expand Down Expand Up @@ -139,7 +142,7 @@ haskell_library(
],
),
src_strip_prefix = "src",
version = "0.0.16",
version = "0.0.17",
visibility = ["//visibility:public"],
deps = [
":ast",
Expand Down
2 changes: 1 addition & 1 deletion cimple.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cimple
version: 0.0.16
version: 0.0.17
synopsis: Simple C-like programming language
homepage: https://toktok.github.io/
license: GPL-3
Expand Down
30 changes: 14 additions & 16 deletions src/Language/Cimple/IO.hs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@ module Language.Cimple.IO
import Control.Monad ((>=>))
import qualified Control.Monad.Parallel as P
import Control.Monad.State.Strict (State, evalState, get, put)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import Language.Cimple.Ast (Node)
import Language.Cimple.Lexer (Lexeme, runAlex)
Expand All @@ -27,37 +26,36 @@ import qualified Language.Cimple.Program as Program
import Language.Cimple.TranslationUnit (TranslationUnit)
import qualified Language.Cimple.TreeParser as TreeParser

type StringNode = Node (Lexeme String)
type TextNode = Node (Lexeme Text)

toTextAst :: [StringNode] -> [TextNode]
toTextAst stringAst =
evalState (mapAst cacheActions stringAst) Map.empty
cacheText :: [TextNode] -> [TextNode]
cacheText textAst =
evalState (mapAst cacheActions textAst) Map.empty
where
cacheActions :: TextActions (State (Map String Text)) String Text
cacheActions :: TextActions (State (Map Text Text)) Text Text
cacheActions = textActions $ \s -> do
m <- get
case Map.lookup s m of
Nothing -> do
let text = Text.pack s
put $ Map.insert s text m
return text
put $ Map.insert s s m
return s
Just text ->
return text


parseText :: Text -> Either String [TextNode]
parseText contents =
toTextAst <$> runAlex (Text.unpack contents) Parser.parseTranslationUnit
parseText = fmap cacheText . flip runAlex Parser.parseTranslationUnit . LBS.fromStrict . Text.encodeUtf8

parseBytes :: LBS.ByteString -> Either String [TextNode]
parseBytes = flip runAlex Parser.parseTranslationUnit

parseTextPedantic :: Text -> Either String [TextNode]
parseTextPedantic =
parseText >=> ParseResult.toEither . TreeParser.parseTranslationUnit
parseBytesPedantic :: LBS.ByteString -> Either String [TextNode]
parseBytesPedantic = parseBytes >=> ParseResult.toEither . TreeParser.parseTranslationUnit


parseFile :: FilePath -> IO (Either String (TranslationUnit Text))
parseFile source =
addSource . parseTextPedantic . Text.decodeUtf8 <$> BS.readFile source
addSource . parseBytesPedantic <$> LBS.readFile source
where
-- Add source filename to the error message, if any.
addSource (Left err) = Left $ source <> err
Expand Down
25 changes: 13 additions & 12 deletions src/Language/Cimple/Lexer.x
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE StrictData #-}
{-# OPTIONS_GHC -Wno-unused-imports #-}
module Language.Cimple.Lexer
( Alex
, AlexPosn (..)
Expand All @@ -15,16 +16,20 @@ module Language.Cimple.Lexer
, lexemePosn
, lexemeText
, lexemeLine
, mkL
, runAlex
) where

import Data.Aeson (FromJSON, ToJSON)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import GHC.Generics (Generic)
import Language.Cimple.Tokens (LexemeClass (..))
}

%wrapper "monad"
%wrapper "monad-bytestring"

tokens :-

Expand Down Expand Up @@ -302,8 +307,9 @@ data Lexeme text = L AlexPosn LexemeClass text
instance FromJSON text => FromJSON (Lexeme text)
instance ToJSON text => ToJSON (Lexeme text)

mkL :: Applicative m => LexemeClass -> AlexInput -> Int -> m (Lexeme String)
mkL c (p, _, _, str) len = pure $ L p c (take len str)
mkL :: LexemeClass -> AlexInput -> Int64 -> Alex (Lexeme Text)
mkL c (p, _, str, _) len = pure $ L p c (piece str)
where piece = Text.decodeUtf8 . LBS.toStrict . LBS.take len

lexemePosn :: Lexeme text -> AlexPosn
lexemePosn (L p _ _) = p
Expand All @@ -317,15 +323,10 @@ lexemeText (L _ _ s) = s
lexemeLine :: Lexeme text -> Int
lexemeLine (L (AlexPn _ l _) _ _) = l

start :: Int -> AlexInput -> Int -> Alex (Lexeme String)
start code _ _ = do
alexSetStartCode code
alexMonadScan
alexEOF :: Alex (Lexeme Text)
alexEOF = return (L (AlexPn 0 0 0) Eof Text.empty)

alexEOF :: Alex (Lexeme String)
alexEOF = return (L (AlexPn 0 0 0) Eof "")

alexScanTokens :: String -> Either String [Lexeme String]
alexScanTokens :: LBS.ByteString -> Either String [Lexeme Text]
alexScanTokens str =
runAlex str $ loop []
where
Expand Down
39 changes: 21 additions & 18 deletions src/Language/Cimple/Parser.y
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
{
{-# LANGUAGE OverloadedStrings #-}
module Language.Cimple.Parser
( parseTranslationUnit
) where

import Data.Fix (Fix (..))
import Data.Text (Text)
import qualified Data.Text as Text
import Language.Cimple.Ast (AssignOp (..), BinaryOp (..),
CommentStyle (..), LiteralType (..),
Node, NodeF (..), Scope (..),
Expand All @@ -22,7 +25,7 @@ import Language.Cimple.Tokens (LexemeClass (..))
%errorhandlertype explist
%lexer {lexwrap} {L _ Eof _}
%monad {Alex}
%tokentype {StringLexeme}
%tokentype {Term}
%token
ID_CONST { L _ IdConst _ }
ID_FUNC_TYPE { L _ IdFuncType _ }
Expand Down Expand Up @@ -173,12 +176,12 @@ CopyrightDecl :: { NonTerm }
CopyrightDecl
: ' ' 'Copyright' CopyrightDates CopyrightOwner '\n' { Fix $ CopyrightDecl (fst $3) (snd $3) $4 }

CopyrightDates :: { (StringLexeme, Maybe StringLexeme) }
CopyrightDates :: { (Term, Maybe Term) }
CopyrightDates
: LIT_INTEGER { ($1, Nothing) }
| LIT_INTEGER '-' LIT_INTEGER { ($1, Just $3) }

CopyrightOwner :: { [StringLexeme] }
CopyrightOwner :: { [Term] }
CopyrightOwner
: CMT_WORD CommentWords { $1 : reverse $2 }

Expand Down Expand Up @@ -216,24 +219,24 @@ Comment
| '/** @} */' { Fix $ CommentSectionEnd $1 }
| Ignore { $1 }

CommentTokens :: { [StringLexeme] }
CommentTokens :: { [Term] }
CommentTokens
: CommentToken { [$1] }
| CommentTokens CommentToken { $2 : $1 }

CommentToken :: { StringLexeme }
CommentToken :: { Term }
CommentToken
: CommentWord { $1 }
| '\n' { $1 }
| ' * ' { $1 }
| ' ' { $1 }

CommentWords :: { [StringLexeme] }
CommentWords :: { [Term] }
CommentWords
: { [] }
| CommentWords CommentWord { $2 : $1 }

CommentWord :: { StringLexeme }
CommentWord :: { Term }
CommentWord
: CMT_WORD { $1 }
| CMT_COMMAND { $1 }
Expand Down Expand Up @@ -265,7 +268,7 @@ Ignore :: { NonTerm }
Ignore
: IGN_START IgnoreBody IGN_END { Fix $ Comment Ignore $1 (reverse $2) $3 }

IgnoreBody :: { [StringLexeme] }
IgnoreBody :: { [Term] }
IgnoreBody
: IGN_BODY { [$1] }
| IgnoreBody IGN_BODY { $2 : $1 }
Expand Down Expand Up @@ -602,7 +605,7 @@ Enumerator
| EnumeratorName '=' ConstExpr ',' { Fix $ Enumerator $1 (Just $3) }
| Comment { $1 }

EnumeratorName :: { StringLexeme }
EnumeratorName :: { Term }
EnumeratorName
: ID_CONST { $1 }
| ID_SUE_TYPE { $1 }
Expand Down Expand Up @@ -683,11 +686,11 @@ NonNull
| nullable '(' Ints ')' { Fix . NonNull [] $3 }
| non_null '(' Ints ')' nullable '(' Ints ')' { Fix . NonNull $3 $7 }

Ints :: { [StringLexeme] }
Ints :: { [Term] }
Ints
: IntList { reverse $1 }

IntList :: { [StringLexeme] }
IntList :: { [Term] }
IntList
: LIT_INTEGER { [$1] }
| IntList ',' LIT_INTEGER { $3 : $1 }
Expand Down Expand Up @@ -729,8 +732,8 @@ ConstDecl
| static const LeafType ID_VAR '=' InitialiserExpr ';' { Fix $ ConstDefn Static $3 $4 $6 }

{
type StringLexeme = Lexeme String
type NonTerm = Node StringLexeme
type Term = Lexeme Text
type NonTerm = Node Term

tyPointer, tyConst :: NonTerm -> NonTerm
tyPointer = Fix . TyPointer
Expand All @@ -741,14 +744,14 @@ parseError (L (AlexPn _ line col) c t, options) =
alexError $ ":" <> show line <> ":" <> show col <> ": Parse error near " <> show c <> ": "
<> show t <> "; expected one of " <> show options
lexwrap :: (Lexeme String -> Alex a) -> Alex a
lexwrap :: (Lexeme Text -> Alex a) -> Alex a
lexwrap = (alexMonadScan >>=)
externC
:: Lexeme String
-> Lexeme String
:: Term
-> Term
-> [NonTerm]
-> Lexeme String
-> Term
-> Alex NonTerm
externC (L _ _ "__cplusplus") (L _ _ "\"C\"") decls (L _ _ "__cplusplus") =
return $ Fix $ ExternC decls
Expand All @@ -758,7 +761,7 @@ externC _ lang _ _ =

macroBodyStmt
:: NonTerm
-> Lexeme String
-> Term
-> Alex NonTerm
macroBodyStmt decls (L _ _ "0") =
return $ Fix $ MacroBodyStmt decls
Expand Down
14 changes: 13 additions & 1 deletion tools/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ haskell_binary(
],
)

haskell_binary(
name = "count-tokens",
srcs = ["count-tokens.hs"],
visibility = ["//visibility:public"],
deps = [
"//hs-cimple",
hazel_library("base"),
hazel_library("bytestring"),
hazel_library("text"),
hazel_library("time"),
],
)

haskell_binary(
name = "dump-tokens",
srcs = ["dump-tokens.hs"],
Expand All @@ -34,7 +47,6 @@ haskell_binary(
hazel_library("base"),
hazel_library("bytestring"),
hazel_library("groom"),
hazel_library("text"),
],
)

Expand Down
41 changes: 41 additions & 0 deletions tools/count-tokens.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{-# LANGUAGE Strict #-}
module Main (main) where

import qualified Data.ByteString.Lazy as LBS
import qualified Data.Text as Text
import Data.Time.Clock (diffUTCTime, getCurrentTime)
import Language.Cimple (Alex, Lexeme (..), LexemeClass (..),
alexMonadScan, runAlex)
import System.Environment (getArgs)


countTokens :: LBS.ByteString -> Either String (Int, Int)
countTokens str = runAlex str $ loop 0 0
where
loop :: Int -> Int -> Alex (Int, Int)
loop toks len = do
(L _ c t) <- alexMonadScan
if c == Eof
then return (toks, len)
else loop (toks + 1) (len + Text.length t)

processFile :: FilePath -> IO (Int, Int)
processFile source = do
contents <- LBS.readFile source
case countTokens contents of
Left err -> fail err
Right ok -> return ok

processFiles :: [FilePath] -> IO (Int, Int)
processFiles = fmap ((\(a, b) -> (sum a, sum b)) . unzip) . mapM processFile

main :: IO ()
main = do
sources <- getArgs
start <- getCurrentTime
(toks, len) <- processFiles sources
end <- getCurrentTime
putStrLn $ "Tokenised " <> show (length sources) <> " sources into "
<> show toks <> " lexemes (" <> show len <> " bytes) in "
<> show (diffUTCTime end start) <> " ("
<> show (fromIntegral len / 1024 / 1024 / diffUTCTime end start) <> " MiB/s)"
20 changes: 9 additions & 11 deletions tools/dump-tokens.hs
Original file line number Diff line number Diff line change
@@ -1,23 +1,21 @@
module Main (main) where

import qualified Data.ByteString as BS
import qualified Data.Text as Text
import qualified Data.Text.Encoding as Text
import Language.Cimple (alexScanTokens)
import System.Environment (getArgs)
import Text.Groom (groom)
import qualified Data.ByteString.Lazy as LBS
import Language.Cimple (alexScanTokens)
import System.Environment (getArgs)
import Text.Groom (groom)

processFile :: FilePath -> IO ()
processFile source = do
putStrLn $ "Processing " ++ source
contents <- Text.unpack . Text.decodeUtf8 <$> BS.readFile source
contents <- LBS.readFile source
case alexScanTokens contents of
Left err -> fail err
Right ok -> putStrLn $ groom ok

main :: IO ()
main = do
args <- getArgs
case args of
[src] -> processFile src
_ -> fail "Usage: dump-tokens <file.c>"
args <- getArgs
case args of
[src] -> processFile src
_ -> fail "Usage: dump-tokens <file.c>"

0 comments on commit 54fb59b

Please sign in to comment.