Skip to content

Commit

Permalink
feat: Now identifying UTF8 characters
Browse files Browse the repository at this point in the history
  • Loading branch information
gcarreno committed Jan 15, 2024
1 parent 8b16101 commit e785771
Show file tree
Hide file tree
Showing 9 changed files with 393 additions and 114 deletions.
50 changes: 45 additions & 5 deletions src/text/opp.text.pas
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,62 @@ interface

type
{ TTextFileType }
// Encoding family of a text source file. Only the correctly spelled
// tftUnknown is declared; it is the identifier TextFileTypeToString uses.
TTextFileType = (tftUnknown, tftAnsi, tftUTF8, tftUTF16, tftUTF32);

function TextFileTypeToString(const ATextFileType: TTextFileType): String;

resourcestring
rsTextFileTypeUnknown = 'Text File Type Unknown';
rsTextFileTypeAnsi = 'Text File Type Ansi';
rsTextFileTypeUTF8 = 'Text File Type UTF8';
rsTextFileTypeUTF16 = 'Text File Type UTF16';
rsTextFileTypeUTF32 = 'Text File Type UTF32';

type
{ TTextCharType }
TTextCharType = (tctUnknown, tctAnsi, tctUTF8, tctUTF16, tctUTF32);

function TextCharTypeToString(const ATextCharType: TTextCharType): String;

resourcestring
rsTextCharTypeUnknown = 'Text Char Type Unknown';
rsTextCharTypeAnsi = 'Text Char Type Ansi';
rsTextCharTypeUTF8 = 'Text Char Type UTF8';
rsTextCharTypeUTF16 = 'Text Char Type UTF16';
rsTextCharTypeUTF32 = 'Text Char Type UTF32';

type
{ #todo 999 -ogcarreno : Determine if a union is possible or best }
{ TTextCharacter }
// One character as produced by TTextSourceFile.GetNextChar, together with an
// end-of-file marker.
TTextCharacter = record
// Classification of the character; tctUnknown until a reader sets it.
&Type: TTextCharType;
// NOTE(review): Ansi, UTF8, UTF16 and UTF32 are only ever reset (#0 / '')
// by the code visible here - confirm whether they are still needed.
Ansi: Char;
UTF8: String;
UTF16: String;
UTF32: String;
// Raw byte(s) of the character exactly as read from the stream.
Value: String;
// True when the reader could not obtain another byte from the stream.
EOF: Boolean;
end;

implementation

{ Translates a TTextFileType value into its descriptive resource string.
  Values outside the declared enumeration leave Result unassigned. }
function TextFileTypeToString(const ATextFileType: TTextFileType): String;
begin
  if ATextFileType = tftUnknown then
    Result:= rsTextFileTypeUnknown
  else if ATextFileType = tftAnsi then
    Result:= rsTextFileTypeAnsi
  else if ATextFileType = tftUTF8 then
    Result:= rsTextFileTypeUTF8
  else if ATextFileType = tftUTF16 then
    Result:= rsTextFileTypeUTF16
  else if ATextFileType = tftUTF32 then
    Result:= rsTextFileTypeUTF32;
end;

{ Translates a TTextCharType value into its descriptive resource string.

  Uses the rsTextCharType* strings (not the rsTextFileType* ones) so the
  text actually says "Text Char Type ...". Any value outside the declared
  enumeration is reported as unknown so Result is always assigned. }
function TextCharTypeToString(const ATextCharType: TTextCharType): String;
begin
  case ATextCharType of
    tctUnknown: Result:= rsTextCharTypeUnknown;
    tctAnsi:    Result:= rsTextCharTypeAnsi;
    tctUTF8:    Result:= rsTextCharTypeUTF8;
    tctUTF16:   Result:= rsTextCharTypeUTF16;
    tctUTF32:   Result:= rsTextCharTypeUTF32;
  else
    Result:= rsTextCharTypeUnknown;
  end;
end;

end.

75 changes: 71 additions & 4 deletions src/text/opp.text.sourcefile.pas
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@ interface
{ ETextSourceFileDoesNotExist }
ETextSourceFileDoesNotExist = class(Exception);

{ ETextSourceFilePrematureEOF }
ETextSourceFilePrematureEOF = class(Exception);

{ TTextSourceFile }
TTextSourceFile = class(TObject)
private
FFilename: String;
FSourceFileStream: TFileStream;
FFileType: TTextFileType;

function GetStreamSize: Int64;
protected
public
constructor Create(const AFileName: String);
Expand All @@ -28,11 +34,16 @@ TTextSourceFile = class(TObject)

property Filename: String
read FFilename;
property FileType: TTextFileType
read FFileType;
property Size: Int64
read GetStreamSize;
published
end;

resourcestring
rsETextSourceFileDoesNotExist = 'File "%s" does not exist';
rsETextSourceFilePrematureEOF = 'File "%s" reached premature EOF';

implementation

Expand All @@ -48,8 +59,11 @@ constructor TTextSourceFile.Create(const AFileName: String);
)
);

{ #todo 999 -ogcarreno : This needs to be BOM and UTF aware!! }
FFilename:= AFileName;
FSourceFileStream:= TFileStream.Create(AFileName, fmOpenRead);
{ #todo 999 -ogcarreno : This needs to change, but for the current code it will do }
FFileType:= tftAnsi;
end;

destructor TTextSourceFile.Destroy;
Expand All @@ -58,15 +72,68 @@ destructor TTextSourceFile.Destroy;
inherited Destroy;
end;

{ Getter behind the Size property: reports the size of the underlying
  source file stream. }
function TTextSourceFile.GetStreamSize: Int64;
begin
  Exit(FSourceFileStream.Size);
end;

{ Reads the next character from the source file stream.

  Result.EOF is True (and nothing else is filled in) when no byte could be
  read. Otherwise Result.Value holds the raw bytes of the character, chosen
  by the first byte:
    $00..$7F : 1 byte,  tctAnsi
    $C2..$DF : 2 bytes, tctUTF8
    $E0..$EF : 3 bytes, tctUTF8
    $F0..$F4 : 4 bytes, tctUTF8
  Any other first byte is returned on its own with type tctUnknown, so the
  byte is not silently lost.

  Trailing bytes are appended as read; they are NOT checked to be valid
  UTF-8 continuation bytes.

  Raises ETextSourceFilePrematureEOF (message includes the file name) when
  the stream ends in the middle of a multi-byte sequence. }
function TTextSourceFile.GetNextChar: TTextCharacter;
var
  { #todo 999 -ogcarreno : This needs to change, but for the current code it will do }
  leadByte: Byte;

  { Reads exactly ACount further bytes and returns them as a string. }
  function ReadTrailingBytes(const ACount: Integer): String;
  var
    index: Integer;
    trailByte: Byte;
  begin
    Result:= '';
    for index:= 1 to ACount do
    begin
      trailByte:= 0;
      if FSourceFileStream.Read(trailByte, SizeOf(trailByte)) = 0 then
        // CreateFmt, not Create: the resource string carries a %s placeholder
        raise ETextSourceFilePrematureEOF.CreateFmt(
          rsETextSourceFilePrematureEOF,
          [FFilename]
        );
      Result:= Result + Char(trailByte);
    end;
  end;

begin
  // Start from an all-empty record: tctUnknown, empty strings, EOF = False
  Result:= Default(TTextCharacter);

  leadByte:= 0;
  if FSourceFileStream.Read(leadByte, SizeOf(leadByte)) = 0 then
  begin
    Result.EOF:= True;
    exit;
  end;

  case leadByte of
    $00..$7F:begin
      Result.&Type:= tctAnsi;
      Result.Value:= Char(leadByte);
    end;
    $C2..$DF:begin
      Result.&Type:= tctUTF8;
      Result.Value:= Char(leadByte);
      Result.Value:= Result.Value + ReadTrailingBytes(1);
    end;
    $E0..$EF:begin
      Result.&Type:= tctUTF8;
      Result.Value:= Char(leadByte);
      Result.Value:= Result.Value + ReadTrailingBytes(2);
    end;
    $F0..$F4:begin
      Result.&Type:= tctUTF8;
      Result.Value:= Char(leadByte);
      Result.Value:= Result.Value + ReadTrailingBytes(3);
    end;
  else
    // $80..$C1 and $F5..$FF cannot start a character handled above: hand the
    // byte back untouched and leave the type as tctUnknown.
    Result.Value:= Char(leadByte);
  end;
end;

end.
Expand Down
50 changes: 25 additions & 25 deletions src/tokenizing/opp.tokenizing.tokenizer.pas
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ interface
uses
Classes
, SysUtils
, OPP.Text
, OPP.Text.SourceFile
, OPP.States
, OPP.States.StackTokens
, OPP.Tokenizing.Tokens
Expand All @@ -23,8 +25,8 @@ ETokenizingTokenizerStackNotEmpty = class(Exception);
{ TTokenizingTokenizer }
TTokenizingTokenizer = class(Tobject)
private
FStream: TStringStream;
FCurrentChar: Char;
FSSourceFile: TTextSourceFile;
FCurrentChar: TTextCharacter;
FLine: Int64;
FRow: Int64;
FStackTokens: TStatesStackTokens;
Expand All @@ -34,7 +36,7 @@ TTokenizingTokenizer = class(Tobject)
procedure FillEOF(var AToken: TToken);
protected
public
constructor Create(const AStream: TStream);
constructor Create(const ASourceFile: TTextSourceFile);
destructor Destroy; override;

function GetNextToken: TToken;
Expand All @@ -45,20 +47,19 @@ implementation

{ TTokenizingTokenizer }

{ Creates a tokenizer that pulls its characters from ASourceFile.
  The source file is only referenced here; the destructor below does not
  free it. }
constructor TTokenizingTokenizer.Create(const ASourceFile: TTextSourceFile);
begin
  FSSourceFile:= ASourceFile;
  // No character has been read yet
  FCurrentChar.&Type:= tctUnknown;
  FCurrentChar.Value:= '';
  FCurrentChar.EOF:= False;
  FLine:= 0;
  FRow:= 0;
  FStackTokens:= TStatesStackTokens.Create;
end;

{ Releases the token state stack created in the constructor. }
destructor TTokenizingTokenizer.Destroy;
begin
  // Free is nil-safe, so no Assigned() guard is needed
  FStackTokens.Free;
  inherited Destroy;
end;
Expand Down Expand Up @@ -96,13 +97,15 @@ procedure TTokenizingTokenizer.FillEOF(var AToken: TToken);
end;

function TTokenizingTokenizer.GetNextToken: TToken;
var
bytesRead: Integer = 0;
begin
FillReset(Result);
Result.Error:= teNone;
Result.&Type:= ttUndefined;
Result.Line:= FLine;
Result.Row:= FRow;
Result.Element:= EmptyStr;

// Exit early if nothing to do
if FStream.Size = 0 then
if FSSourceFile.Size = 0 then
begin
Result.&Type:= ttEOF;
exit;
Expand All @@ -111,11 +114,10 @@ function TTokenizingTokenizer.GetNextToken: TToken;
FStackTokens.Push(tsUndefined);
repeat
// Read one char at a time
{ #todo 999 -ogcarreno : This needs to be BOM and UTF aware!! }
bytesRead:= FStream.Read(FCurrentChar, 1);
FCurrentChar:= FSSourceFile.GetNextChar;

// This is EOF
if bytesRead = 0 then
if FCurrentChar.EOF then
begin
case FStackTokens.Peek of
tsUndefined:begin
Expand Down Expand Up @@ -146,7 +148,7 @@ function TTokenizingTokenizer.GetNextToken: TToken;
if FLine = 0 then FLine:= 1;

// Decide per character
case FCurrentChar of
case FCurrentChar.Value of
// White Spaces
#9, ' ':begin
if not (FStackTokens.Peek = tsWhiteSpace) then FStackTokens.Push(tsWhiteSpace);
Expand All @@ -155,25 +157,25 @@ function TTokenizingTokenizer.GetNextToken: TToken;
#10, #13:begin
case FStackTokens.Peek of
tsUndefined:begin
if FCurrentChar = #10 then
if FCurrentChar.Value = #10 then
begin
FillEOL(Result);
Result.Element:= FCurrentChar;
Result.Element:= FCurrentChar.Value;
break;
end;
if FCurrentChar = #13 then
if FCurrentChar.Value = #13 then
begin
FillEOL(Result, False);
Result.Element:= FCurrentChar;
Result.Element:= FCurrentChar.Value;
FStackTokens.Push(tsMaybeCRLF);
continue;
end;
end;
tsMaybeCRLF:begin
if FCurrentChar = #10 then
if FCurrentChar.Value = #10 then
begin
FillEOL(Result);
Result.Element:= Result.Element + FCurrentChar;
Result.Element:= Result.Element + FCurrentChar.Value;
FStackTokens.Pop;
break;
end;
Expand All @@ -188,8 +190,6 @@ function TTokenizingTokenizer.GetNextToken: TToken;
FStackTokens.Pop;
end;

writeln('State: ', TokenStateToString(FStackTokens.Peek), ' Char: ', FCurrentChar);

until FStackTokens.Peek = tsUndefined;
if FStackTokens.Count > 1 then
raise ETokenizingTokenizerStackNotEmpty.Create(rsETokenizingTokenizerStackNotEmpty);
Expand Down
Loading

0 comments on commit e785771

Please sign in to comment.