Skip to content

Commit 92632ef

Browse files
author
Dan Smith
committed
Utf-8 -> Windows-1252
1 parent 9b71993 commit 92632ef

2 files changed

Lines changed: 108 additions & 31 deletions

File tree

externals/coda-oss/modules/c++/str/include/str/Encoding.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,12 @@ inline void strto8(const std::u32string& s, sys::U8string& result)
124124
utf32to8(s, result);
125125
}
126126

127+
/*
128+
Using the utf8:: library for these causes compiler-errors on some platforms.
129+
We don't really need them anyway: Linux has good support for UTF-8 and
130+
our existing code-bases don't use UTF-16 on Windows; so UTF-8 to
131+
Windows-1252 will be good enough.
132+
127133
void utf8to16(sys::U8string::const_pointer, size_t, std::u16string&);
128134
inline void utf8to16(const sys::U8string& s, std::u16string& result)
129135
{
@@ -135,6 +141,7 @@ inline void utf8to32(const sys::U8string& s, std::u32string& result)
135141
{
136142
utf8to32(s.c_str(), s.size(), result);
137143
}
144+
*/
138145

139146
//////////////////////////////////////////////////////////////////////////////////////////
140147

externals/coda-oss/modules/c++/str/source/Encoding.cpp

Lines changed: 101 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
* =========================================================================
44
*
55
* (C) Copyright 2004 - 2014, MDA Information Systems LLC
6+
* (C) Copyright 2020, 2021, 2022, Maxar Technologies, Inc.
67
*
78
* str-c++ is free software; you can redistribute it and/or modify
89
* it under the terms of the GNU Lesser General Public License as published by
@@ -22,10 +23,6 @@
2223

2324
#include <assert.h>
2425

25-
#ifdef _WIN32
26-
#include <comdef.h> // _bstr_t
27-
#endif
28-
2926
#include <map>
3027
#include <locale>
3128
#include <stdexcept>
@@ -47,7 +44,7 @@ static inline str::U8string utf8_(uint32_t ch)
4744
str::utf32to8(s, retval);
4845
return retval;
4946
};
50-
static const std::map<uint32_t, sys::U8string> Windows1252_x80_x9F_to_u8string{
47+
static const std::map<std::u32string::value_type, sys::U8string> Windows1252_x80_x9F_to_u8string{
5148
{0x80, utf8_(0x20AC) } // EURO SIGN
5249
// , {0x81, replacement_character } // UNDEFINED
5350
, {0x82, utf8_(0x201A) } // SINGLE LOW-9 QUOTATION MARK
@@ -89,6 +86,28 @@ static constexpr sys::U8string::value_type cast(uint8_t ch)
8986
static_assert(sizeof(decltype(ch)) == sizeof(sys::U8string::value_type), "sizeof(uint8_t) != sizeof(Char8_t)");
9087
return static_cast<sys::U8string::value_type>(ch);
9188
}
89+
90+
static std::map<std::u32string::value_type, sys::U8string> Windows1252_to_u8string()
91+
{
92+
auto retval = Windows1252_x80_x9F_to_u8string;
93+
94+
// Add the ISO8859-1 values to the map too. 1) We're already looking
95+
// in the map anyway for Windows-1252 characters. 2) Need map
96+
// entires for conversion from UTF-8 to Windows-1252.
97+
for (uint32_t ch_ = 0xA0; ch_ <= 0xff; ch_++)
98+
{
99+
// ISO8859-1 can be converted to UTF-8 with bit-twiddling
100+
const auto ch = static_cast<uint8_t>(ch_);
101+
102+
// https://stackoverflow.com/questions/4059775/convert-iso-8859-1-strings-to-utf-8-in-c-c
103+
// *out++=0xc2+(*in>0xbf), *out++=(*in++&0x3f)+0x80;
104+
sys::U8string s{cast(0xc2 + (ch > 0xbf)), cast((ch & 0x3f) + 0x80)}; // ISO8859-1
105+
retval[ch_] = std::move(s);
106+
}
107+
108+
return retval;
109+
}
110+
92111
static sys::U8string fromWindows1252(uint8_t ch)
93112
{
94113
// ASCII is the same in UTF-8
@@ -97,15 +116,7 @@ static sys::U8string fromWindows1252(uint8_t ch)
97116
return sys::U8string{cast(ch)}; // ASCII
98117
}
99118

100-
// ISO8859-1 can be converted to UTF-8 with bit-twiddling
101-
if (ch > 0x9F)
102-
{
103-
// https://stackoverflow.com/questions/4059775/convert-iso-8859-1-strings-to-utf-8-in-c-c
104-
// *out++=0xc2+(*in>0xbf), *out++=(*in++&0x3f)+0x80;
105-
return sys::U8string{cast(0xc2 + (ch > 0xbf)), cast((ch & 0x3f) + 0x80)}; // ISO8859-1
106-
}
107-
108-
static const auto map = Windows1252_x80_x9F_to_u8string;
119+
static const auto map = Windows1252_to_u8string();
109120
const auto it = map.find(ch);
110121
if (it != map.end())
111122
{
@@ -135,6 +146,79 @@ void str::windows1252to8(W1252string::const_pointer p, size_t sz, sys::U8string&
135146
}
136147
}
137148

149+
// Returns the inverse of a map: each value of `kv` becomes a key that maps
// back to its original key. When several keys share one value, the entry
// visited last (i.e. the greatest such key) is the one that remains.
template<typename TKey, typename TValue>
std::map<TValue, TKey> kv_to_vk(const std::map<TKey, TValue>& kv)
{
    std::map<TValue, TKey> inverted;
    for (auto it = kv.begin(); it != kv.end(); ++it)
    {
        inverted[it->second] = it->first;
    }
    return inverted;
}
159+
160+
// Keeping this "static" for now, don't want to encourage this conversion. Client
161+
// access is via str::toString().
162+
// Converts the UTF-8 bytes in [p, p + sz) to Windows-1252, appending to `result`.
//
// * ASCII bytes (< 0x80) are copied through unchanged.
// * Two- and three-byte sequences are looked up in the inverse of the
//   Windows1252_to_u8string() table; a hit appends the Windows-1252 value.
// * Everything else (four-byte sequences, sequences with no Windows-1252
//   equivalent, malformed input) appends <DEL> (0x7F) as a placeholder.
// * A non-ASCII byte that is the very last byte of the input appends <DEL>
//   and stops the conversion.
static void toWindows1252(str::U8string::const_pointer p, size_t sz, str::W1252string& result)
{
    constexpr auto invalid = static_cast<str::W1252string::value_type>(0x7F); // <DEL>

    // Appends the Windows-1252 value for one multi-byte UTF-8 sequence, or
    // <DEL> when that sequence has no entry in the table.
    const auto appendConverted = [&](const str::U8string& utf8)
    {
        // Built on first use; maps a UTF-8 byte sequence back to its Windows-1252 value.
        static const auto map = kv_to_vk(Windows1252_to_u8string());
        const auto it = map.find(utf8);
        if (it != map.end())
        {
            result += static_cast<str::W1252string::value_type>(it->second);
        }
        else
        {
            // UTF-8 character can't be converted to Windows-1252
            result += invalid; // <DEL>
        }
    };

    for (size_t i = 0; i < sz; i++)
    {
        // ASCII is the same in UTF-8
        if (p[i] < static_cast<str::U8string::value_type>(0x80))
        {
            result += static_cast<str::W1252string::value_type>(p[i]); // ASCII
            continue;
        }

        // A non-ASCII byte needs at least one more byte after it. This must
        // test i + 1 (not i + i): otherwise every non-ASCII character in the
        // second half of the input is treated as truncated.
        if (!(i + 1 < sz))
        {
            // No remaining bytes, invalid UTF-8 encoding
            result += invalid;
            return;
        }

        // https://en.wikipedia.org/wiki/UTF-8
        const auto b1 = static_cast<uint8_t>(p[i]);
        i++; // move to second byte

        if (b1 >= 0xF0) // 11110xxx
        {
            // four-byte sequence, nothing to convert to Windows-1252
            result += invalid; // <DEL>
            i += 2; // skip third and fourth bytes; the loop increment steps past the last one
            continue;
        }

        const auto b2 = static_cast<uint8_t>(p[i]);

        if (b1 >= 0xE0) // 1110xxxx
        {
            // Three-byte sequence: several Windows-1252 characters in the
            // 0x80-0x9F range (e.g. U+20AC EURO SIGN) are encoded this way.
            i++; // move to third byte
            if (i < sz)
            {
                const auto b3 = static_cast<uint8_t>(p[i]);
                appendConverted(str::U8string{cast(b1), cast(b2), cast(b3)});
            }
            else
            {
                // input ends before the third byte
                result += invalid; // <DEL>
            }
            continue;
        }

        if (b2 < 0x80) // 10xxxxxx
        {
            // invalid second byte
            result += invalid; // <DEL>
            continue;
        }

        appendConverted(str::U8string{cast(b1), cast(b2)});
    }
}
220+
221+
138222
struct back_inserter final
139223
{
140224
sys::U8string* container = nullptr; // pointer instead of reference for copy
@@ -173,17 +257,6 @@ void str::utf32to8(std::u32string::const_pointer p, size_t sz, sys::U8string& re
173257
utf8::utf32to8(p, p + sz, back_inserter(result));
174258
}
175259

176-
void str::utf8to16(sys::U8string::const_pointer p, size_t sz, std::u16string& result)
177-
{
178-
auto p8 = cast<const uint8_t*>(p);
179-
utf8::utf8to16(p8, p8 + sz, std::back_inserter(result));
180-
}
181-
void str::utf8to32(sys::U8string::const_pointer p, size_t sz, std::u32string& result)
182-
{
183-
auto p8 = cast<const uint8_t*>(p);
184-
utf8::utf8to32(p8, p8 + sz, std::back_inserter(result));
185-
}
186-
187260
inline void wsto8_(std::u16string::const_pointer begin, std::u16string::const_pointer end, sys::U8string& result)
188261
{
189262
utf8::utf16to8(begin, end, back_inserter(result));
@@ -231,12 +304,9 @@ std::string str::toString(const str::U8string& utf8)
231304
auto platform = details::Platform; // "conditional expression is constant"
232305
if (platform == details::PlatformType::Windows)
233306
{
234-
#ifdef _WIN32
235-
std::u16string utf16;
236-
utf8to16(utf8, utf16);
237-
const _bstr_t s(c_str<std::wstring::const_pointer>(utf16)); // wchar_t is UTF-16 on Windows
238-
return static_cast<std::string::const_pointer>(s);
239-
#endif
307+
str::W1252string w1252;
308+
toWindows1252(utf8.c_str(), utf8.length(), w1252);
309+
return c_str<std::string::const_pointer>(w1252); // copy
240310
}
241311
else if (platform == details::PlatformType::Linux)
242312
{

0 commit comments

Comments
 (0)