From af8fa664f71276fb857360531f04b1c7fb101d22 Mon Sep 17 00:00:00 2001
From: tobigun <tobigun@b956fd51-792f-4845-bead-9b4dfca2ff2c>
Date: Sat, 14 Mar 2009 22:10:00 +0000
Subject: - font-engine uses UCS4 internally - more UTf-8 and UCS4 routines in
 UnicodeUtils

git-svn-id: svn://svn.code.sf.net/p/ultrastardx/svn/branches/experimental@1636 b956fd51-792f-4845-bead-9b4dfca2ff2c
---
 unicode/src/base/UFont.pas         | 206 ++++++++++++++++++++++---------------
 unicode/src/base/UUnicodeUtils.pas | 133 +++++++++++++++++++++++-
 2 files changed, 249 insertions(+), 90 deletions(-)

diff --git a/unicode/src/base/UFont.pas b/unicode/src/base/UFont.pas
index 3d6e9be3..346f4a07 100644
--- a/unicode/src/base/UFont.pas
+++ b/unicode/src/base/UFont.pas
@@ -47,6 +47,7 @@ uses
   glext,
   glu,
   sdl,
+  UUnicodeUtils,
   {$IFDEF BITMAP_FONT}
   UTexture,
   {$ENDIF}
@@ -60,7 +61,7 @@ type
   TGLubyteArray = array[0 .. (MaxInt div SizeOf(GLubyte))-1] of GLubyte;
   TGLubyteDynArray = array of GLubyte;
 
-  TWideStringArray = array of WideString;
+  TUCS4StringArray = array of UCS4String;
 
   TGLColor = packed record
     case byte of
@@ -127,33 +128,33 @@ type
       {**
        * Splits lines in Text seperated by newline (char-code #13).
        * @param Text   UTF-8 encoded string
-       * @param Lines  splitted WideString lines
+       * @param Lines  splitted UCS4String lines
        *}
-      procedure SplitLines(const Text: UTF8String; var Lines: TWideStringArray);
+      procedure SplitLines(const Text: UCS4String; var Lines: TUCS4StringArray);
 
       {**
-       * Print an array of WideStrings. Each array-item is a line of text.
+       * Print an array of UCS4Strings. Each array-item is a line of text.
        * Lines of text are seperated by the line-spacing.
        * This is the base function for all text drawing.
        *}
-      procedure Print(const Text: TWideStringArray); overload; virtual;
+      procedure Print(const Text: TUCS4StringArray); overload; virtual;
 
       {**
        * Draws an underline.
        *}
-      procedure DrawUnderline(const Text: WideString); virtual;
+      procedure DrawUnderline(const Text: UCS4String); virtual;
 
       {**
        * Renders (one) line of text.
        *}
-      procedure Render(const Text: WideString); virtual; abstract;
+      procedure Render(const Text: UCS4String); virtual; abstract;
 
       {**
        * Returns the bounds of text-lines contained in Text.
        * @param(Advance  if true the right bound is set to the advance instead
        *   of the minimal right bound.)
        *}
-      function BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl; overload; virtual; abstract;
+      function BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl; overload; virtual; abstract;
 
       {**
        * Resets all user settings to default values.
@@ -188,6 +189,8 @@ type
       {**
        * Prints a text.
        *}
+      procedure Print(const Text: UCS4String); overload;
+      {** UTF-16 version of @link(Print) }
       procedure Print(const Text: WideString); overload;
       {** UTF-8 version of @link(Print) }
       procedure Print(const Text: UTF8String); overload;
@@ -203,6 +206,8 @@ type
        * bigger than the text's width as it additionally contains the advance
        * and glyph-spacing of the last character.
        *}
+      function BBox(const Text: UCS4String; Advance: boolean = true): TBoundsDbl; overload;
+      {** UTF-16 version of @link(BBox) }
       function BBox(const Text: WideString; Advance: boolean = true): TBoundsDbl; overload;
       {** UTF-8 version of @link(BBox) }
       function BBox(const Text: UTF8String; Advance: boolean = true): TBoundsDbl; overload;
@@ -249,9 +254,9 @@ type
       /// Mipmap fonts (size[level+1] = size[level]/2)
       fMipmapFonts: array[0..cMaxMipmapLevel] of TFont;
 
-      procedure Render(const Text: WideString); override;
-      procedure Print(const Text: TWideStringArray); override;
-      function BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl; override;
+      procedure Render(const Text: UCS4String); override;
+      procedure Print(const Text: TUCS4StringArray); override;
+      function BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl; override;
 
       {**
        * Callback called for creation of each mipmap font.
@@ -322,7 +327,7 @@ type
 
   {**
    * Table for storage of max. 256 glyphs.
-   * Used for the second cache level. Indexed by the LSB of the WideChar
+   * Used for the second cache level. Indexed by the LSB of the UCS4Char
    * char-code.
    *}
   PGlyphTable = ^TGlyphTable;
@@ -332,7 +337,7 @@ type
    * Cache for glyphs of a single font.
    * The cached glyphs are stored inside a hash-list.
    * Hashing is performed in two steps:
-   * 1. the least significant byte (LSB) of the WideChar character code
+   * 1. the least significant byte (LSB) of the UCS4Char character code
    * is removed (shr 8) and the result (we call it BaseCode here) looked up in
    * the hash-list.
    * 2. Each entry of the hash-list contains a table with max. 256 entries.
@@ -359,22 +364,22 @@ type
        * Add glyph Glyph with char-code ch to the cache.
        * @returns @true on success, @false otherwise
        *}
-      function AddGlyph(ch: WideChar; const Glyph: TGlyph): boolean;
+      function AddGlyph(ch: UCS4Char; const Glyph: TGlyph): boolean;
 
       {**
        * Removes the glyph with char-code ch from the cache.
        *}
-      procedure DeleteGlyph(ch: WideChar);
+      procedure DeleteGlyph(ch: UCS4Char);
 
       {**
        * Removes the glyph with char-code ch from the cache.
        *}
-      function GetGlyph(ch: WideChar): TGlyph;
+      function GetGlyph(ch: UCS4Char): TGlyph;
 
       {**
        * Checks if a glyph with char-code ch is cached.
        *}
-      function HasGlyph(ch: WideChar): boolean;
+      function HasGlyph(ch: UCS4Char): boolean;
 
       {**
        * Remove and free all cached glyphs. If KeepBaseSet is set to
@@ -408,13 +413,13 @@ type
        * Retrieves a cached glyph with char-code ch from cache.
        * If the glyph is not already cached, it is loaded with LoadGlyph().
        *}
-      function GetGlyph(ch: WideChar): TGlyph;
+      function GetGlyph(ch: UCS4Char): TGlyph;
 
       {**
        * Callback to create (load) a glyph with char-code ch.
        * Implemented by subclasses.
        *}
-      function LoadGlyph(ch: WideChar): TGlyph; virtual; abstract;
+      function LoadGlyph(ch: UCS4Char): TGlyph; virtual; abstract;
 
     public
       constructor Create();
@@ -436,6 +441,7 @@ type
    *}
   TFTGlyph = class(TGlyph)
     private
+      fCharCode:  UCS4Char;     //**< Char code
       fCharIndex: FT_UInt;      //**< Freetype specific char-index (<> char-code)
       fDisplayList: GLuint;     //**< Display-list ID
       fTexture: GLuint;         //**< Texture ID
@@ -477,7 +483,7 @@ type
        * Creates a glyph with char-code ch from font Font.
        * @param LoadFlags  flags passed to FT_Load_Glyph()
        *}
-      constructor Create(Font: TFTFont; ch: WideChar; Outset: single;
+      constructor Create(Font: TFTFont; ch: UCS4Char; Outset: single;
                          LoadFlags: FT_Int32);
       destructor Destroy(); override;
 
@@ -507,10 +513,10 @@ type
       fUseDisplayLists: boolean;    //**< true: use display-lists, false: direct drawing
 
       {** @seealso TCachedFont.LoadGlyph }
-      function LoadGlyph(ch: WideChar): TGlyph; override;
+      function LoadGlyph(ch: UCS4Char): TGlyph; override;
 
-      procedure Render(const Text: WideString); override;
-      function BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl; override;
+      procedure Render(const Text: UCS4String); override;
+      function BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl; override;
 
       function GetHeight(): single; override;
       function GetAscender(): single; override;
@@ -585,9 +591,9 @@ type
       procedure ResetIntern();
       
   protected
-      procedure DrawUnderline(const Text: WideString); override;
-      procedure Render(const Text: WideString); override;
-      function BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl; override;
+      procedure DrawUnderline(const Text: UCS4String); override;
+      procedure Render(const Text: UCS4String); override;
+      function BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl; override;
 
       function GetHeight(): single; override;
       function GetAscender(): single; override;
@@ -672,7 +678,7 @@ type
 
       procedure ResetIntern();
 
-      procedure RenderChar(ch: WideChar; var AdvanceX: real);
+      procedure RenderChar(ch: UCS4Char; var AdvanceX: real);
 
       {**
        * Load font widths from an info file.
@@ -682,8 +688,8 @@ type
       procedure LoadFontInfo(const InfoFile: AnsiString);
 
     protected
-      procedure Render(const Text: WideString); override;
-      function BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl; override;
+      procedure Render(const Text: UCS4String); override;
+      function BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl; override;
 
       function GetHeight(): single; override;
       function GetAscender(): single; override;
@@ -801,37 +807,61 @@ begin
   ResetIntern();
 end;
 
-procedure TFont.SplitLines(const Text: UTF8String; var Lines: TWideStringArray);
+procedure TFont.SplitLines(const Text: UCS4String; var Lines: TUCS4StringArray);
 var
-  LineList: TStringList;
-  LineIndex: integer;
+  CharIndex: integer;
+  LineStart: integer;
+  LineLength: integer;
+  EOT: boolean; // End-Of-Text
 begin
-  // split lines on newline (there is no WideString version of ExtractStrings)
-  LineList := TStringList.Create();
-  ExtractStrings([#13], [], PChar(Text), LineList);
+  // split lines on newline (there is no UCS4String version of ExtractStrings)
+  SetLength(Lines, 0);
+  EOT := false;
+  LineStart := 0;
+
+  for CharIndex := 0 to High(Text) do
+  begin
+    // check for end of text (UCS4Strings are zero-terminated)
+    if (CharIndex = High(Text)) then
+      EOT := true;
+
+    // check for newline (carriage return (#13)) or end of text
+    if (Text[CharIndex] = 13) or EOT then
+    begin
+      LineLength := CharIndex - LineStart;
+      // check if last character was a newline
+      if (EOT and (LineLength = 0)) then
+        Break;      
+
+      // copy line (even if LineLength is 0)
+      SetLength(Lines, Length(Lines)+1);
+      Lines[High(Lines)] := UCS4Copy(Text, LineStart, LineLength);
 
-  // create an array of WideStrins from the UTF-8 string-list
-  SetLength(Lines, LineList.Count);
-  for LineIndex := 0 to LineList.Count-1 do
-    Lines[LineIndex] := UTF8Decode(LineList[LineIndex]);
-  LineList.Free();
+      LineStart := CharIndex+1;
+    end;
+  end;
 end;
 
-function TFont.BBox(const Text: UTF8String; Advance: boolean): TBoundsDbl;
+function TFont.BBox(const Text: UCS4String; Advance: boolean): TBoundsDbl;
 var
-  LineArray: TWideStringArray;
+  LineArray: TUCS4StringArray;
 begin
   SplitLines(Text, LineArray);
   Result := BBox(LineArray, Advance);
   SetLength(LineArray, 0);
 end;
 
+function TFont.BBox(const Text: UTF8String; Advance: boolean): TBoundsDbl;
+begin
+  Result := BBox(UTF8Decode(Text), Advance);
+end;
+
 function TFont.BBox(const Text: WideString; Advance: boolean): TBoundsDbl;
 begin
-  Result := BBox(UTF8Encode(Text), Advance);
+  Result := BBox(WideStringToUCS4String(Text), Advance);
 end;
 
-procedure TFont.Print(const Text: TWideStringArray);
+procedure TFont.Print(const Text: TUCS4StringArray);
 var
   LineIndex: integer;
 begin
@@ -912,21 +942,26 @@ begin
   glPopAttrib();
 end;
 
-procedure TFont.Print(const Text: UTF8String);
+procedure TFont.Print(const Text: UCS4String);
 var
-  LineArray: TWideStringArray;
+  LineArray: TUCS4StringArray;
 begin
   SplitLines(Text, LineArray);
   Print(LineArray);
   SetLength(LineArray, 0);
 end;
 
+procedure TFont.Print(const Text: UTF8String);
+begin
+  Print(UTF8Decode(Text));
+end;
+
 procedure TFont.Print(const Text: WideString);
 begin
-  Print(UTF8Encode(Text));
+  Print(WideStringToUCS4String(Text));
 end;
 
-procedure TFont.DrawUnderline(const Text: WideString);
+procedure TFont.DrawUnderline(const Text: UCS4String);
 var
   UnderlineY1, UnderlineY2: single;
   Bounds: TBoundsDbl;
@@ -1194,7 +1229,7 @@ begin
   glScalef(MipmapScale, MipmapScale, 0);
 end;
 
-procedure TScalableFont.Print(const Text: TWideStringArray);
+procedure TScalableFont.Print(const Text: TUCS4StringArray);
 begin
   glPushMatrix();
 
@@ -1210,12 +1245,12 @@ begin
   glPopMatrix();
 end;
 
-procedure TScalableFont.Render(const Text: WideString);
+procedure TScalableFont.Render(const Text: UCS4String);
 begin
   Assert(false, 'Unused TScalableFont.Render() was called');
 end;
 
-function TScalableFont.BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl;
+function TScalableFont.BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl;
 begin
   Result := fBaseFont.BBox(Text, Advance);
   Result.Left   := Result.Left * fScale * fAspect;
@@ -1346,7 +1381,7 @@ begin
   inherited;
 end;
 
-function TCachedFont.GetGlyph(ch: WideChar): TGlyph;
+function TCachedFont.GetGlyph(ch: UCS4Char): TGlyph;
 begin
   Result := fCache.GetGlyph(ch);
   if (Result = nil) then
@@ -1372,7 +1407,7 @@ constructor TFTFont.Create(
     Size: integer; Outset: single;
     LoadFlags: FT_Int32);
 var
-  i: WideChar;
+  ch: UCS4Char;
 begin
   inherited Create();
 
@@ -1400,8 +1435,8 @@ begin
   ResetIntern();
 
   // pre-cache some commonly used glyphs (' ' - '~')
-  for i := #32 to #126 do
-    fCache.AddGlyph(i, TFTGlyph.Create(Self, i, Outset, LoadFlags));
+  for ch := 32 to 126 do
+    fCache.AddGlyph(ch, TFTGlyph.Create(Self, ch, Outset, LoadFlags));
 end;
 
 destructor TFTFont.Destroy();
@@ -1424,15 +1459,15 @@ begin
   ResetIntern();
 end;
 
-function TFTFont.LoadGlyph(ch: WideChar): TGlyph;
+function TFTFont.LoadGlyph(ch: UCS4Char): TGlyph;
 begin
   Result := TFTGlyph.Create(Self, ch, Outset, fLoadFlags);
 end;
 
-function TFTFont.BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl;
+function TFTFont.BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl;
 var
   Glyph, PrevGlyph: TFTGlyph;
-  TextLine: WideString;
+  TextLine: UCS4String;
   LineYOffset: single;
   LineIndex, CharIndex: integer;
   LineBounds: TBoundsDbl;
@@ -1462,7 +1497,7 @@ begin
     LineBounds.Top    := 0;
 
     // for each glyph image, compute its bounding box
-    for CharIndex := 1 to Length(TextLine) do
+    for CharIndex := 0 to LengthUCS4(TextLine)-1 do
     begin
       Glyph := TFTGlyph(GetGlyph(TextLine[CharIndex]));
       if (Glyph <> nil) then
@@ -1480,9 +1515,9 @@ begin
           LineBounds.Left := LineBounds.Right + Glyph.Bounds.Left;
 
         // update right bound
-        if (CharIndex < Length(TextLine)) or  // not the last character
-           (TextLine[CharIndex] = ' ') or     // on space char (Bounds.Right = 0)
-           Advance then                       // or in advance mode
+        if (CharIndex < LengthUCS4(TextLine)-1) or  // not the last character
+           (TextLine[CharIndex] = Ord(' ')) or      // on space char (Bounds.Right = 0)
+           Advance then                             // or in advance mode
         begin
           // add advance and glyph spacing
           LineBounds.Right := LineBounds.Right + Glyph.Advance.x + GlyphSpacing
@@ -1540,7 +1575,7 @@ begin
     Result.Bottom := 0.0;
 end;
 
-procedure TFTFont.Render(const Text: WideString);
+procedure TFTFont.Render(const Text: UCS4String);
 var
   CharIndex: integer;
   Glyph, PrevGlyph: TFTGlyph;
@@ -1550,7 +1585,7 @@ begin
   PrevGlyph := nil;
 
   // draw current line
-  for CharIndex := 1 to Length(Text) do
+  for CharIndex := 0 to LengthUCS4(Text)-1 do
   begin
     Glyph := TFTGlyph(GetGlyph(Text[CharIndex]));
     if (Assigned(Glyph)) then
@@ -1705,7 +1740,7 @@ begin
   ResetIntern();
 end;
 
-procedure TFTOutlineFont.DrawUnderline(const Text: WideString);
+procedure TFTOutlineFont.DrawUnderline(const Text: UCS4String);
 var
   CurrentColor: TGLColor;
   OutlineColor: TGLColor;
@@ -1730,7 +1765,7 @@ begin
   glPopMatrix();
 end;
 
-procedure TFTOutlineFont.Render(const Text: WideString);
+procedure TFTOutlineFont.Render(const Text: UCS4String);
 var
   CurrentColor: TGLColor;
   OutlineColor: TGLColor;
@@ -1770,7 +1805,7 @@ begin
   fInnerFont.FlushCache(KeepBaseSet);
 end;
 
-function TFTOutlineFont.BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl;
+function TFTOutlineFont.BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl;
 begin
   Result := fOutlineFont.BBox(Text, Advance);
 end;
@@ -2151,13 +2186,14 @@ begin
   FT_Done_Glyph(Glyph);
 end;
 
-constructor TFTGlyph.Create(Font: TFTFont; ch: WideChar; Outset: single;
+constructor TFTGlyph.Create(Font: TFTFont; ch: UCS4Char; Outset: single;
     LoadFlags: FT_Int32);
 begin
   inherited Create();
 
   fFont := Font;
   fOutset := Outset;
+  fCharCode := ch;
 
   // get the Freetype char-index (use default UNICODE charmap)
   fCharIndex := FT_Get_Char_Index(Font.fFace, FT_ULONG(ch));
@@ -2336,7 +2372,7 @@ begin
   InsertPos := fHash.Count;
 end;
 
-function TGlyphCache.AddGlyph(ch: WideChar; const Glyph: TGlyph): boolean;
+function TGlyphCache.AddGlyph(ch: UCS4Char; const Glyph: TGlyph): boolean;
 var
   BaseCode:  cardinal;
   GlyphCode: integer;
@@ -2346,7 +2382,7 @@ var
 begin
   Result := false;
 
-  BaseCode := cardinal(ch) shr 8;
+  BaseCode := Ord(ch) shr 8;
   GlyphTable := FindGlyphTable(BaseCode, InsertPos);
   if (GlyphTable = nil) then
   begin
@@ -2356,7 +2392,7 @@ begin
   end;
 
   // get glyph table offset
-  GlyphCode := cardinal(ch) and $FF;
+  GlyphCode := Ord(ch) and $FF;
   // insert glyph into table if not present
   if (GlyphTable[GlyphCode] = nil) then
   begin
@@ -2365,19 +2401,19 @@ begin
   end;
 end;
 
-procedure TGlyphCache.DeleteGlyph(ch: WideChar);
+procedure TGlyphCache.DeleteGlyph(ch: UCS4Char);
 var
   Table: PGlyphTable;
   TableIndex, GlyphIndex: integer;
   TableEmpty: boolean;
 begin
   // find table
-  Table := FindGlyphTable(cardinal(ch) shr 8, TableIndex);
+  Table := FindGlyphTable(Ord(ch) shr 8, TableIndex);
   if (Table = nil) then
     Exit;
 
   // find glyph    
-  GlyphIndex := cardinal(ch) and $FF;
+  GlyphIndex := Ord(ch) and $FF;
   if (Table[GlyphIndex] <> nil) then
   begin
     // destroy glyph
@@ -2402,19 +2438,19 @@ begin
   end;
 end;
 
-function TGlyphCache.GetGlyph(ch: WideChar): TGlyph;
+function TGlyphCache.GetGlyph(ch: UCS4Char): TGlyph;
 var
   InsertPos: integer;
   Table: PGlyphTable;
 begin
-  Table := FindGlyphTable(cardinal(ch) shr 8, InsertPos);
+  Table := FindGlyphTable(Ord(ch) shr 8, InsertPos);
   if (Table = nil) then
     Result := nil
   else
-    Result := Table[cardinal(ch) and $FF];
+    Result := Table[Ord(ch) and $FF];
 end;
 
-function TGlyphCache.HasGlyph(ch: WideChar): boolean;
+function TGlyphCache.HasGlyph(ch: UCS4Char): boolean;
 begin
   Result := (GetGlyph(ch) <> nil);
 end;
@@ -2540,11 +2576,11 @@ begin
   Stream.Free;
 end;
 
-function TBitmapFont.BBox(const Text: TWideStringArray; Advance: boolean): TBoundsDbl;
+function TBitmapFont.BBox(const Text: TUCS4StringArray; Advance: boolean): TBoundsDbl;
 var
   LineIndex, CharIndex: integer;
   CharCode: cardinal;
-  Line: WideString;
+  Line: UCS4String;
   LineWidth: double;
 begin
   Result.Left := 0;
@@ -2556,7 +2592,7 @@ begin
   begin
     Line := Text[LineIndex];
     LineWidth := 0;
-    for CharIndex := 1 to Length(Line) do
+    for CharIndex := 0 to LengthUCS4(Line)-1 do
     begin
       CharCode := Ord(Line[CharIndex]);
       if (CharCode < Length(fWidths)) then
@@ -2567,7 +2603,7 @@ begin
   end;
 end;
 
-procedure TBitmapFont.RenderChar(ch: WideChar; var AdvanceX: real);
+procedure TBitmapFont.RenderChar(ch: UCS4Char; var AdvanceX: real);
 var
   TexX, TexY:        real;
   TexR, TexB:        real;
@@ -2659,20 +2695,20 @@ begin
   AdvanceX := AdvanceX + GlyphWidth;
 end;
 
-procedure TBitmapFont.Render(const Text: WideString);
+procedure TBitmapFont.Render(const Text: UCS4String);
 var
   CharIndex: integer;
   AdvanceX: real;
 begin
   // if there is no text do nothing
-  if (Text = '') then
+  if (Text = nil) or (Text[0] = 0) then
     Exit;
 
   //Save the current color and alpha (for reflection)
   glGetFloatv(GL_CURRENT_COLOR, @fTempColor);
 
   AdvanceX := 0;
-  for CharIndex := 1 to Length(Text) do
+  for CharIndex := 0 to LengthUCS4(Text)-1 do
   begin
     RenderChar(Text[CharIndex], AdvanceX);
   end;
diff --git a/unicode/src/base/UUnicodeUtils.pas b/unicode/src/base/UUnicodeUtils.pas
index 01c279bd..26f240a9 100644
--- a/unicode/src/base/UUnicodeUtils.pas
+++ b/unicode/src/base/UUnicodeUtils.pas
@@ -34,11 +34,11 @@ interface
 {$I switches.inc}
 
 uses
-  SysUtils
 {$IFDEF MSWINDOWS}
-  , Windows
+  Windows,
 {$ENDIF}
-  ;
+  SysUtils;
+  
 (*
  * Character classes
  *)
@@ -58,6 +58,19 @@ function IsPunctuationChar(ch: UCS4Char): boolean; overload;
 function IsControlChar(ch: WideChar): boolean; overload;
 function IsControlChar(ch: UCS4Char): boolean; overload;
 
+{**
+ * Checks if the given string is a valid UTF-8 string.
+ * If an ANSI encoded string (with char codes >= 128) is passed, the
+ * function will most probably return false, as most ANSI strings sequences
+ * are illegal in UTF-8.
+ *}
+function IsUTF8String(const str: AnsiString): boolean;
+
+{**
+ * Checks if the string is composed of ASCII characters.
+ *}
+function IsASCIIString(const str: AnsiString): boolean;
+
 {*
  * String format conversion
  *}
@@ -71,6 +84,12 @@ function UCS4ToUTF8String(ch: UCS4Char): UTF8String; overload;
  *}
 function LengthUTF8(const str: UTF8String): integer;
 
+{**
+ * Returns the length of an UCS4String. Note that Length(UCS4String) returns
+ * the length+1 as UCS4Strings are zero-terminated.
+ *}
+function LengthUCS4(const str: UCS4String): integer;
+
 function UTF8CompareStr(const S1, S2: UTF8String): integer;
 function UTF8CompareText(const S1, S2: UTF8String): integer;
 
@@ -93,12 +112,19 @@ function UCS4UpperCase(ch: UCS4Char): UCS4Char; overload;
 function UCS4UpperCase(const str: UCS4String): UCS4String; overload;
 
 {**
- *
+ * Converts a UCS4Char to an UCS4String.
+ * Note that UCS4Strings are zero-terminated dynamic arrays.
  *}
 function UCS4CharToString(ch: UCS4Char): UCS4String;
 
-(*
+{**
+ * Copies a segment of str starting with Index with Count characters.
+ * Note: Do not use Copy() to copy UCS4Strings as the result will not contain
+ * a trailing #0 character and hence is invalid.  
+ *}
+function UCS4Copy(const str: UCS4String; Index: Integer = 0; Count: Integer = -1): UCS4String;
 
+(*
  * Converts a WideString to its upper-case representation.
  * Wrapper for WideUpperCase. Needed because some plattforms have problems with
  * unicode support.
@@ -199,6 +225,78 @@ begin
   Result := IsControlChar(WideChar(Ord(ch)));
 end;
 
+
+function IsUTF8String(const str: AnsiString): boolean;
+
+  // find the most significant zero bit (Result: [7..-1])
+  function FindZeroMSB(b: byte): integer;
+  var
+    Mask: byte;
+  begin
+    Mask := $80;
+    Result := 7;
+    while (b and Mask <> 0) do
+    begin
+      Mask := Mask shr 1;
+      Dec(Result);
+    end;
+  end;
+
+var
+  I: integer;
+  ZeroBit: integer;
+  SeqCount: integer; // number of trailing bytes to follow
+begin
+  Result := false;
+  SeqCount := 0;
+
+  for I := 1 to Length(str) do
+  begin
+    if (str[I] >= #128) then
+    begin
+      ZeroBit := FindZeroMSB(Ord(str[I]));
+      // trailing byte expected
+      if (SeqCount > 0) then
+      begin
+        // check if trailing byte has pattern 10xxxxxx
+        if (ZeroBit <> 6) then
+          Exit;
+        Dec(SeqCount);
+      end
+      else // leading byte expected
+      begin
+        // check if pattern is one of 110xxxxx/1110xxxx/11110xxx
+        if (ZeroBit > 5) or (ZeroBit < 3) then
+          Exit;
+        // calculate number of trailing bytes (1, 2 or 3)
+        SeqCount := 6 - ZeroBit;
+      end;
+    end;
+  end;
+
+  // trailing bytes missing?
+  if (SeqCount > 0) then
+    Exit;
+
+  Result := true;
+end;
+
+function IsASCIIString(const str: AnsiString): boolean;
+var
+  I: integer;
+begin
+  for I := 1 to Length(str) do
+  begin
+    if (str[I] >= #128) then
+    begin
+      Result := false;
+      Exit;
+    end;
+  end;    
+  Result := true;
+end;
+
+
 function UTF8ToUCS4String(const str: UTF8String): UCS4String;
 begin
   Result := WideStringToUCS4String(UTF8Decode(str));
@@ -219,6 +317,11 @@ begin
   Result := Length(UTF8ToUCS4String(str));
 end;
 
+function LengthUCS4(const str: UCS4String): integer;
+begin
+  Result := High(str);
+end;
+
 function UTF8CompareStr(const S1, S2: UTF8String): integer;
 begin
   // FIXME
@@ -284,6 +387,26 @@ begin
   Result[1] := 0;
 end;
 
+function UCS4Copy(const str: UCS4String; Index: Integer; Count: Integer): UCS4String;
+var
+  I: integer;
+  MaxCount: integer;
+begin
+  // calculate max. copy count
+  MaxCount := LengthUCS4(str)-Index;
+  if (MaxCount < 0) then
+    MaxCount := 0;
+  // adjust copy count
+  if (Count > MaxCount) or (Count < 0) then
+    Count := MaxCount;
+
+  // copy (and add zero terminator)
+  SetLength(Result, Count + 1);
+  for I := 0 to Count-1 do
+    Result[I] := str[Index+I];
+  Result[Count] := 0;
+end;
+
 function WideStringUpperCase(ch: WideChar): WideString;
 begin
   // If WideChar #0 is converted to a WideString in Delphi, a string with
-- 
cgit v1.2.3