Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes for UTF8 character representation within string literals #74

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/Makefile.inc
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Unix configuration Makefile for Moscow ML -*- mode: makefile -*-

# Where to install stuff
PREFIX=/usr/local
PREFIX=/home/igrant/local

# BINDIR contains true executable files, such as scripts
# LIBDIR contains bytecode files (such as mosmlcmp and library units), and .dll/.so for dynlibs.
Expand Down Expand Up @@ -37,7 +37,7 @@ BASELIBS=-lm

# This works with most systems, including MacOS X with XCode installed:

CC=gcc
CC=egcc
# CC=gcc -mmacosx-version-min=10.7 # for building OS X package
# CC=/usr/sepp/bin/gcc # Solaris at KVL

Expand Down Expand Up @@ -68,8 +68,8 @@ UNAME_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
ifeq ($(UNAME_S),Linux)
CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token
STRIP=strip -S
LD=gcc -rdynamic -Wl,-rpath,$(LIBDIR)
DYNLD=gcc -shared
LD=$(CC) -rdynamic -Wl,-rpath,$(LIBDIR)
DYNLD=$(CC) -shared
endif
ifeq ($(UNAME_S),Darwin) # For MacOS X, use the same as Linux except DYNDL
CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token
Expand All @@ -92,8 +92,8 @@ ifeq ($(UNAME_S),OpenBSD)
ADDRUNLIBS=
CPP=cpp -P -traditional -Dunix -Umsdos -Wno-invalid-pp-token
STRIP=strip -S
LD=gcc -rdynamic -Wl,-rpath,$(LIBDIR)
DYNLD=gcc -shared
LD=$(CC) -rdynamic -Wl,-rpath,$(LIBDIR)
DYNLD=$(CC) -shared
endif

ifeq ($(UNAME_S),Custom) # Your configuration here
Expand Down
6 changes: 3 additions & 3 deletions src/compiler/Config.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,15 @@ val pervasiveOpenedUnits = ["General"];

val fulllib = ["Option", "List", "ListPair", "Strbase", "Char", "String",
"StringCvt", "TextIO", "BasicIO", "Vector",
"Array", "VectorSlice", "ArraySlice", "Misc", "Substring",
"Bool", "Int", "Real", "Math",
"Array", "VectorSlice", "ArraySlice", "Misc",
"Substring", "Bool", "Int", "Real", "Math",
"Word", "Word8", "Word8Vector", "Word8Array",
"Word8VectorSlice", "Word8ArraySlice", "Byte",
"BinIO", "CharVector", "CharArray",
"CharVectorSlice", "CharArraySlice",
"Time", "Timer", "Date", "Path",
"FileSys", "Process", "OS",
"Mosml", "PP", "CommandLine"]
"Mosml", "UTF8", "PP", "CommandLine"]

val preloadedUnitSets = [
("default", ["Option", "List", "Strbase", "Char", "String",
Expand Down
83 changes: 67 additions & 16 deletions src/compiler/Lexer.lex
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ open Fnlib Memory Config Mixture Const Parser;

(* For Quote/Antiquote --- object language embedding. *)

val quotation = ref false
val quotation = ref false

val utf8 = ref false

datatype lexingMode =
NORMALlm
Expand Down Expand Up @@ -119,9 +121,17 @@ fun store_string_char c =
incr string_index
end

(* Append every character of the list, in order, to the string buffer
   by calling store_string_char on each one. *)
fun store_string_chars chars =
  case chars of
    [] => ()
  | ch :: rest => (store_string_char ch; store_string_chars rest)

(* Append the characters of s, in order, to the string buffer. *)
fun store_string s =
  let val chars = String.explode s
  in store_string_chars chars
  end

(* Build a CharArraySlice from the argument slc (passed straight to
   CharArraySlice.slice) and return its contents as a vector. *)
fun extracta slc =
  let open CharArraySlice
  in vector (slice slc)
  end
(* Build a CharVectorSlice from the argument slc (passed straight to
   CharVectorSlice.slice) and return its contents as a vector. *)
fun extractv slc =
  let open CharVectorSlice
  in vector (slice slc)
  end

fun get_stored_string() =
let open CharArraySlice
val s = vector(slice(!string_buff, 0, SOME (!string_index)))
let open CharArray
val s = extracta(!string_buff, 0, SOME (!string_index))
in
string_buff := initial_string_buffer;
s
Expand Down Expand Up @@ -207,6 +217,28 @@ fun scanString scan lexbuf =
setLexStartPos lexbuf (!savedLexemeStart - getLexAbsPos lexbuf)
)

(* Numeric value of a hexadecimal digit character.
   Decimal digits map to 0..9.  Every other character is folded with
   (ord - 55) mod 32, which sends A..F and a..f to 10..15; no validation
   is performed, so callers are expected to pass hex digits only. *)
fun hexval c =
  let val code = Char.ord c
  in
    if Char.isDigit c then code - Char.ord #"0"
    else (code - 55) mod 32
  end;

(* Convert the Unicode escape just matched by the lexer (backslash, then
   u or U, an optional +, then hex digits) into UTF-8 and append the
   resulting bytes to the string buffer.

   lexbuf : the lexer buffer; its current lexeme is the whole escape,
            starting with the backslash.
   i      : index in the lexeme at which the u/U/+ marker begins.  Both
            call sites pass 1, i.e. the position just past the backslash.

   No exception is handled here: anything raised by UTF8.UCStoUTF8String
   propagates to the caller. *)
fun UTF8StringOfUCSEscapeSequence lexbuf i =
  let
    val s = getLexeme lexbuf
    val sl = String.size s
    (* Index of the first character at or after n that is not u, U or +. *)
    fun skipPrefix n =
      let val c = String.sub (s, n)
      in
        if c = #"u" orelse c = #"U" orelse c = #"+"
        then skipPrefix (n + 1)
        else n
      end
    (* Fold the hex digits from index n to the end of the lexeme into a word. *)
    fun hexCharsToWord n =
      let
        fun iter acc k =
          if k < sl
          then iter (acc * 0x10 + hexval (String.sub (s, k))) (k + 1)
          else acc
      in Word.fromInt (iter 0 n)
      end
  in
    (* Start scanning at the caller-supplied offset instead of a
       hard-coded 1, so the parameter is no longer silently ignored. *)
    store_string (UTF8.UCStoUTF8String (hexCharsToWord (skipPrefix i)))
  end;

}

rule Token = parse
Expand Down Expand Up @@ -277,7 +309,7 @@ and TokenN = parse
{ scanString String lexbuf;
let val s = get_stored_string() in
if size s <> 1 then
lexError "ill-formed character constant" lexbuf
lexError "ill-formed (possibly multi-byte encoded) character constant" lexbuf
else ();
CHAR (CharVector.sub(s, 0))
end }
Expand Down Expand Up @@ -375,15 +407,19 @@ and String = parse
store_string_char(Char.chr code);
String lexbuf
end }
| "\\u" [`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
[`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
{ let val code = charCodeOfHexadecimal lexbuf 1 in
if code >= 256 then
skipString "character code is too large" SkipString lexbuf
else ();
store_string_char(Char.chr code);
String lexbuf
end }
| `\\` [`u``U`]
[`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
[`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
{ UTF8StringOfUCSEscapeSequence lexbuf 1
handle UTF8.BadUTF8 s => skipString s SkipString lexbuf;
String lexbuf }
| `\\` [`u``U`] `+`
[`0`-`9``a`-`f``A`-`F`]? [`0`-`9``a`-`f``A`-`F`]?
[`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
[`0`-`9``a`-`f``A`-`F`] [`0`-`9``a`-`f``A`-`F`]
{ UTF8StringOfUCSEscapeSequence lexbuf 1
handle UTF8.BadUTF8 s => skipString s SkipString lexbuf;
String lexbuf }
| `\\`
{ skipString "ill-formed escape sequence" SkipString lexbuf }
| (eof | `\^Z`)
Expand All @@ -392,9 +428,24 @@ and String = parse
{ skipString "newline not permitted in string" SkipString lexbuf }
| [`\^A`-`\^Z` `\127` `\255`]
{ skipString "invalid character in string" SkipString lexbuf }
| _
{ (store_string_char(getLexemeChar lexbuf 0);
String lexbuf) }
| "" { UTF8Char lexbuf;
String lexbuf }

(* UTF8Char: consume one character of string-literal content and append
   its bytes to the string buffer.

   - A single byte in the range 0..127 is stored as is.
   - A multi-byte sequence is accepted only when its lead byte and its
     continuation bytes fall in the ranges listed below; the whole lexeme
     is then stored unchanged.  NOTE(review): the ranges appear to follow
     the Unicode "well-formed UTF-8 byte sequences" table (no overlong
     forms, no surrogates, nothing above U+10FFFF) -- confirm against the
     standard before relying on it.
   - Anything else is reported with lexError when the utf8 flag is set;
     otherwise the matched lexeme is stored unchanged. *)
and UTF8Char = parse
[`\^@`-`\127`] { store_string_char(getLexemeChar lexbuf 0) }
| ( [`\194`-`\223`] [`\128`-`\191`]
| `\224` [`\160`-`\191`] [`\128`-`\191`]
| [`\225`-`\236`] [`\128`-`\191`] [`\128`-`\191`]
| `\237` [`\128`-`\159`] [`\128`-`\191`]
| [`\238``\239`] [`\128`-`\191`] [`\128`-`\191`]
| `\240` [`\144`-`\191`] [`\128`-`\191`] [`\128`-`\191`]
| [`\241`-`\243`] [`\128`-`\191`] [`\128`-`\191`] [`\128`-`\191`]
| `\244` [`\128`-`\143`] [`\128`-`\191`] [`\128`-`\191`]
) { store_string (getLexeme lexbuf) }
| _ { if !utf8
then lexError "ill-formed UTF8 character code" lexbuf
else store_string (getLexeme lexbuf)
}

and SkipString = parse
`"`
Expand Down
1 change: 1 addition & 0 deletions src/compiler/Lexer.sig
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
val quotation : bool ref;
val utf8 : bool ref;
val resetLexerState : unit -> unit;
val Token : Lexing.lexbuf -> Parser.token;
2 changes: 1 addition & 1 deletion src/compiler/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ T_LIBOBJS= \
StringCvt.uo Word.uo Word8.uo Word8Vector.uo CharVector.uo \
Word8Array.uo CharArray.uo Obj.uo Nonstdio.uo \
Substring.uo Path.uo Time.uo OS.uo FileSys.uo \
Lexing.uo Parsing.uo PP.uo
Lexing.uo Parsing.uo UTF8.uo PP.uo

T_OBJS= \
Predef.uo Prim_c.uo Symtable.uo Patch.uo Tr_const.uo \
Expand Down
4 changes: 4 additions & 0 deletions src/compiler/Smltop.sml
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,8 @@ val smltop_con_basis =
("loadPath",{ qualid={qual="Meta", id=["loadPath"]}, info=VARname REGULARo}),
("quotation",
{ qualid={qual="Meta", id=["quotation"]}, info=VARname REGULARo}),
("utf8",
{ qualid={qual="Meta", id=["utf8"]}, info=VARname REGULARo}),
("valuepoly",
{ qualid={qual="Meta", id=["valuepoly"]}, info=VARname REGULARo}),
("printVal", { qualid={qual="Meta", id=["printVal"]},info=VARname OVL1TXXo}),
Expand Down Expand Up @@ -300,6 +302,7 @@ val smltop_VE =
("quietdec", trivial_scheme(type_ref type_bool)),
("loadPath", trivial_scheme(type_ref (type_list type_string))),
("quotation", trivial_scheme(type_ref type_bool)),
("utf8", trivial_scheme(type_ref type_bool)),
("valuepoly", trivial_scheme(type_ref type_bool)),
("printVal", sc_bogus),
("printDepth", trivial_scheme(type_ref type_int)),
Expand Down Expand Up @@ -337,6 +340,7 @@ fun resetSMLTopDynEnv() =
("quietdec", repr Exec_phr.quietdec),
("loadPath", repr Mixture.load_path),
("quotation", repr Lexer.quotation),
("utf8", repr Lexer.utf8),
("valuepoly", repr Mixture.value_polymorphism),
("printVal", repr evalPrint),
("printDepth", repr printDepth),
Expand Down
87 changes: 44 additions & 43 deletions src/dynlibs/Makefile
Original file line number Diff line number Diff line change
@@ -1,53 +1,54 @@
MAKE=gmake

all:
cd interface; make
cd intinf; make
cd mgd; make
cd mgdbm; make
cd mmysql; make
cd mpq; make
cd mregex; make
cd msocket; make
cd munix; make
cd interface; $(MAKE)
cd intinf; $(MAKE)
cd mgd; $(MAKE)
cd mgdbm; $(MAKE)
cd mmysql; $(MAKE)
cd mpq; $(MAKE)
cd mregex; $(MAKE)
cd msocket; $(MAKE)
cd munix; $(MAKE)

install:
cd intinf; make install
cd mgd; make install
cd mgdbm; make install
cd mmysql; make install
cd mpq; make install
cd mregex; make install
cd msocket; make install
cd munix; make install
cd intinf; $(MAKE) install
cd mgd; $(MAKE) install
cd mgdbm; $(MAKE) install
cd mmysql; $(MAKE) install
cd mpq; $(MAKE) install
cd mregex; $(MAKE) install
cd msocket; $(MAKE) install
cd munix; $(MAKE) install

uninstall:
cd intinf; make uninstall
cd mgd; make uninstall
cd mgdbm; make uninstall
cd mmysql; make uninstall
cd mpq; make uninstall
cd mregex; make uninstall
cd msocket; make uninstall
cd munix; make uninstall
cd intinf; $(MAKE) uninstall
cd mgd; $(MAKE) uninstall
cd mgdbm; $(MAKE) uninstall
cd mmysql; $(MAKE) uninstall
cd mpq; $(MAKE) uninstall
cd mregex; $(MAKE) uninstall
cd msocket; $(MAKE) uninstall
cd munix; $(MAKE) uninstall

test:
cd interface; make test
cd intinf; make test
cd mgd; make test
cd mgdbm; make test
cd mmysql; make test
cd mpq; make test
cd mregex; make test
cd munix; make test
cd interface; $(MAKE) test
cd intinf; $(MAKE) test
cd mgd; $(MAKE) test
cd mgdbm; $(MAKE) test
cd mmysql; $(MAKE) test
cd mpq; $(MAKE) test
cd mregex; $(MAKE) test
cd munix; $(MAKE) test

clean:
cd crypt; make clean
cd interface; make clean
cd intinf; make clean
cd mgd; make clean
cd mgdbm; make clean
cd mmysql; make clean
cd mpq; make clean
cd mregex; make clean
cd msocket; make clean
cd munix; make clean
cd crypt; $(MAKE) clean
cd interface; $(MAKE) clean
cd intinf; $(MAKE) clean
cd mgd; $(MAKE) clean
cd mgdbm; $(MAKE) clean
cd mmysql; $(MAKE) clean
cd mpq; $(MAKE) clean
cd mregex; $(MAKE) clean
cd msocket; $(MAKE) clean
cd munix; $(MAKE) clean
2 changes: 1 addition & 1 deletion src/dynlibs/intinf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ intinf.o: intinf.c
$(CC) $(CFLAGS) -c -o intinf.o intinf.c

libmgmp.so: intinf.o
$(DYNLD) -o libmgmp.so intinf.o -L$(GMPLIBDIR) -lgmp -lc
$(DYNLD) -o libmgmp.so intinf.o -L$(GMPLIBDIR) -lgmp

test:
echo $(CURDIR)
Expand Down
2 changes: 1 addition & 1 deletion src/dynlibs/munix/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ munix.o: munix.c
$(CC) $(CFLAGS) -c -o munix.o munix.c

libmunix.so: munix.o
$(DYNLD) -o libmunix.so munix.o -lc
$(DYNLD) -o libmunix.so munix.o

install:
$(INSTALL_DATA) libmunix.so $(DESTDIR)$(LIBDIR)
Expand Down
Binary file modified src/mosmlcmp
Binary file not shown.
Binary file modified src/mosmllex
Binary file not shown.
7 changes: 5 additions & 2 deletions src/mosmllib/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ all: Array.uo Array2.uo ArraySlice.uo Arraysort.uo \
TextIO.uo Time.uo Timer.uo \
Unix.uo Vector.uo VectorSlice.uo \
Weak.uo Word.uo Word8.uo Word8Array.uo Word8ArraySlice.uo \
Word8Vector.uo Word8VectorSlice.uo
Word8Vector.uo Word8VectorSlice.uo UTF8.uo

# Make with the current compiler
current:
Expand Down Expand Up @@ -80,6 +80,8 @@ PackRealLittle.uo: PackRealLittle.ui Word8Array.ui Word8ArraySlice.ui \
TextIO.ui: StringCvt.ui Char.ui
Msp.uo: Msp.ui String.ui StringCvt.ui List.ui Option.ui Vector.ui TextIO.ui \
Int.ui Mosmlcgi.ui Char.ui
UTF8.uo: UTF8.ui String.ui Word.ui CharVector.ui StringCvt.ui Word8.ui \
Int.ui Word8Vector.ui Char.ui
AppleScript.uo: AppleScript.ui
Regex.uo: Regex.ui Word.ui Dynlib.ui List.ui Vector.ui Substring.ui
Time.uo: Time.ui Real.ui StringCvt.ui Char.ui
Expand Down Expand Up @@ -159,12 +161,13 @@ Array2.ui: Vector.ui
ArraySlice.ui: Vector.ui Array.ui VectorSlice.ui
Int.uo: Int.ui String.ui StringCvt.ui Char.ui
Signal.ui: Word.ui
UTF8.ui: Word.ui String.ui Char.ui
Buffer.uo: Buffer.ui String.ui Substring.ui
PackRealBig.ui: Word8Array.ui Word8Vector.ui
Dynlib.uo: Dynlib.ui
Dynarray.uo: Dynarray.ui Array.ui
Word8Vector.ui: Word8.ui
PP.uo: PP.ui String.ui List.ui Vector.ui Array.ui TextIO.ui
PP.uo: PP.ui String.ui UTF8.ui List.ui Vector.ui Array.ui TextIO.ui
Word8ArraySlice.ui: Word8Array.ui Word8.ui Word8Vector.ui \
Word8VectorSlice.ui
Parsing.uo: Parsing.ui Lexing.ui Vector.ui Obj.uo
Expand Down
2 changes: 2 additions & 0 deletions src/mosmllib/PP.sig
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ datatype break_style =
CONSISTENT
| INCONSISTENT

val utf8 : bool ref

val mk_ppstream : ppconsumer -> ppstream
val dest_ppstream : ppstream -> ppconsumer
val add_break : ppstream -> int * int -> unit
Expand Down
Loading