Skip to content

Commit fe65231

Browse files
committed
fix: broken support for non-UTF-8 encodings
Affects code in SaveUnicodeFile, OpenTextFile, DetectEncoding and ReadTextfileChunk. Refs #2268
1 parent 4a8c53c commit fe65231

File tree

1 file changed

+46
-34
lines changed

1 file changed

+46
-34
lines changed

source/apphelpers.pas

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,14 +1307,19 @@ function FormatTimeNumber(Seconds: Double; DisplaySeconds: Boolean; MilliSeconds
13071307
}
13081308
procedure SaveUnicodeFile(Filename: String; Text: String; Encoding: TEncoding);
var
  Writer: TFileStream;
  Preamble: TBytes;
  Bytes: TBytes;
begin
  // Writes Text to Filename in the given encoding, creating or overwriting the file.
  // Encoding may be nil when previously loaded via auto-detection
  if not Assigned(Encoding) then
    Encoding := UTF8NoBOMEncoding;

  // Emit the encoding's byte order mark, if it has one. DetectEncoding() identifies
  // UTF-16 and UTF-8-with-BOM files solely by this mark, so omitting it would make
  // a saved UTF-16 file be re-read as UTF-8 without BOM.
  Preamble := Encoding.GetPreamble;
  Bytes := Encoding.GetBytes(Text); // Encode text

  Writer := TFileStream.Create(Filename, fmCreate);
  try
    // Guard both writes: indexing element 0 of an empty dynamic array is invalid,
    // which happens for encodings without preamble and for empty Text.
    if Length(Preamble) > 0 then
      Writer.WriteBuffer(Preamble[0], Length(Preamble));
    if Length(Bytes) > 0 then
      Writer.WriteBuffer(Bytes[0], Length(Bytes));
  finally
    Writer.Free;
  end;
end;
13191324

13201325

@@ -1325,56 +1330,62 @@ procedure OpenTextFile(const Filename: String; out Stream: TFileStream; var Enco
13251330
begin
13261331
// Open a textfile and return a stream. Detect its encoding if not passed by the caller
13271332
Stream := TFileStream.Create(Filename, fmOpenRead or fmShareDenyNone);
1328-
//if Encoding = nil then
1329-
// Encoding := DetectEncoding(Stream);
1333+
if Encoding = nil then
1334+
Encoding := DetectEncoding(Stream);
13301335
// If the file contains a BOM, advance the stream's position
13311336
BomLen := 0;
1332-
{if Length(Encoding.GetPreamble) > 0 then begin
1337+
if Length(Encoding.GetPreamble) > 0 then begin
13331338
SetLength(Header, Length(Encoding.GetPreamble));
13341339
Stream.ReadBuffer(Pointer(Header)^, Length(Header));
13351340
if CompareMem(Header, Encoding.GetPreamble, SizeOf(Header)) then
13361341
BomLen := Length(Encoding.GetPreamble);
1337-
end;}
1342+
end;
13381343
Stream.Position := BomLen;
13391344
end;
13401345

13411346

13421347
{**
  Determine a stream's text encoding from the byte order mark at its start.
  Result can be:
    UTF-16 BE with BOM  -> TEncoding.BigEndianUnicode
    UTF-16 LE with BOM  -> TEncoding.Unicode
    UTF-8 with BOM      -> TEncoding.UTF8
    no recognized BOM   -> UTF8NoBOMEncoding
  The stream's position is restored to its previous value before returning.
}
function DetectEncoding(Stream: TStream): TEncoding;
const
  Utf8Mark: array[0..2] of Byte = ($EF, $BB, $BF);
  Utf16LeMark: array[0..1] of Byte = ($FF, $FE);
  Utf16BeMark: array[0..1] of Byte = ($FE, $FF);
var
  Head: array[0..3] of Byte;
  HeadLen: Integer;
  SavedPos: Int64;

  // True if at least MarkLen bytes were read and they equal the given mark
  function HasMark(const Mark; MarkLen: Integer): Boolean;
  begin
    Result := (HeadLen >= MarkLen) and CompareMem(@Head[0], @Mark, MarkLen);
  end;

begin
  // Peek at the first bytes, then put the stream back where the caller left it
  SavedPos := Stream.Position;
  Stream.Position := 0;
  try
    HeadLen := Stream.Read(Head, SizeOf(Head));
  finally
    Stream.Position := SavedPos;
  end;

  if HasMark(Utf8Mark, Length(Utf8Mark)) then
    Exit(TEncoding.UTF8);
  if HasMark(Utf16LeMark, Length(Utf16LeMark)) then
    Exit(TEncoding.Unicode); // UTF-16 LE
  if HasMark(Utf16BeMark, Length(Utf16BeMark)) then
    Exit(TEncoding.BigEndianUnicode); // UTF-16 BE
  // Could add detection for UTF-32 BOMs too if needed

  // No BOM
  Result := UTF8NoBOMEncoding;
end;
13731383

13741384

13751385
function ReadTextfileChunk(Stream: TFileStream; Encoding: TEncoding; ChunkSize: Int64 = 0): String;
13761386
var
13771387
DataLeft: Int64;
1388+
Bytes: TBytes;
13781389
begin
13791390
// Read a chunk or the complete contents out of a textfile, opened by OpenTextFile()
13801391
if Stream.Size = 0 then begin
@@ -1386,8 +1397,9 @@ function ReadTextfileChunk(Stream: TFileStream; Encoding: TEncoding; ChunkSize:
13861397
if (ChunkSize = 0) or (ChunkSize > DataLeft) then
13871398
ChunkSize := DataLeft;
13881399

1389-
SetLength(Result, ChunkSize);
1390-
Stream.Read(PChar(Result)^, ChunkSize);
1400+
SetLength(Bytes, ChunkSize);
1401+
Stream.ReadBuffer(Bytes[0], Length(Bytes));
1402+
Result := Encoding.GetString(Bytes);
13911403
end;
13921404

13931405

0 commit comments

Comments
 (0)