33 * =========================================================================
44 *
55 * (C) Copyright 2004 - 2014, MDA Information Systems LLC
6+ * (C) Copyright 2020, 2021, 2022, Maxar Technologies, Inc.
67 *
78 * str-c++ is free software; you can redistribute it and/or modify
89 * it under the terms of the GNU Lesser General Public License as published by
2223
2324#include < assert.h>
2425
25- #ifdef _WIN32
26- #include < comdef.h> // _bstr_t
27- #endif
28-
2926#include < map>
3027#include < locale>
3128#include < stdexcept>
@@ -47,7 +44,7 @@ static inline str::U8string utf8_(uint32_t ch)
4744 str::utf32to8 (s, retval);
4845 return retval;
4946};
50- static const std::map<uint32_t , sys::U8string> Windows1252_x80_x9F_to_u8string{
47+ static const std::map<std::u32string::value_type , sys::U8string> Windows1252_x80_x9F_to_u8string{
5148 {0x80 , utf8_ (0x20AC ) } // EURO SIGN
5249 // , {0x81, replacement_character } // UNDEFINED
5350 , {0x82 , utf8_ (0x201A ) } // SINGLE LOW-9 QUOTATION MARK
@@ -89,6 +86,28 @@ static constexpr sys::U8string::value_type cast(uint8_t ch)
8986 static_assert (sizeof (decltype (ch)) == sizeof (sys::U8string::value_type), " sizeof(uint8_t) != sizeof(Char8_t)" );
9087 return static_cast <sys::U8string::value_type>(ch);
9188}
89+
90+ static std::map<std::u32string::value_type, sys::U8string> Windows1252_to_u8string ()
91+ {
92+ auto retval = Windows1252_x80_x9F_to_u8string;
93+
94+ // Add the ISO8859-1 values to the map too. 1) We're already looking
95+ // in the map anyway for Windows-1252 characters. 2) Need map
96+ // entires for conversion from UTF-8 to Windows-1252.
97+ for (uint32_t ch_ = 0xA0 ; ch_ <= 0xff ; ch_++)
98+ {
99+ // ISO8859-1 can be converted to UTF-8 with bit-twiddling
100+ const auto ch = static_cast <uint8_t >(ch_);
101+
102+ // https://stackoverflow.com/questions/4059775/convert-iso-8859-1-strings-to-utf-8-in-c-c
103+ // *out++=0xc2+(*in>0xbf), *out++=(*in++&0x3f)+0x80;
104+ sys::U8string s{cast (0xc2 + (ch > 0xbf )), cast ((ch & 0x3f ) + 0x80 )}; // ISO8859-1
105+ retval[ch_] = std::move (s);
106+ }
107+
108+ return retval;
109+ }
110+
92111static sys::U8string fromWindows1252 (uint8_t ch)
93112{
94113 // ASCII is the same in UTF-8
@@ -97,15 +116,7 @@ static sys::U8string fromWindows1252(uint8_t ch)
97116 return sys::U8string{cast (ch)}; // ASCII
98117 }
99118
100- // ISO8859-1 can be converted to UTF-8 with bit-twiddling
101- if (ch > 0x9F )
102- {
103- // https://stackoverflow.com/questions/4059775/convert-iso-8859-1-strings-to-utf-8-in-c-c
104- // *out++=0xc2+(*in>0xbf), *out++=(*in++&0x3f)+0x80;
105- return sys::U8string{cast (0xc2 + (ch > 0xbf )), cast ((ch & 0x3f ) + 0x80 )}; // ISO8859-1
106- }
107-
108- static const auto map = Windows1252_x80_x9F_to_u8string;
119+ static const auto map = Windows1252_to_u8string ();
109120 const auto it = map.find (ch);
110121 if (it != map.end ())
111122 {
@@ -135,6 +146,79 @@ void str::windows1252to8(W1252string::const_pointer p, size_t sz, sys::U8string&
135146 }
136147}
137148
// Inverts a map: every (key, value) entry of kv becomes (value, key) in the
// result.  Entries are visited in key order, so when several keys share a
// value the entry visited last (the largest key) is the one that is kept.
template <typename TKey, typename TValue>
std::map<TValue, TKey> kv_to_vk(const std::map<TKey, TValue>& kv)
{
    std::map<TValue, TKey> inverted;
    for (auto entry = kv.begin(); entry != kv.end(); ++entry)
    {
        inverted[entry->second] = entry->first;
    }
    return inverted;
}
159+
160+ // Keeping this "static" for now, don't want to encouarge this converstion. Client
161+ // access is via str::toString().
162+ static void toWindows1252 (str::U8string::const_pointer p, size_t sz, str::W1252string& result)
163+ {
164+ for (size_t i = 0 ; i < sz; i++)
165+ {
166+ // ASCII is the same in UTF-8
167+ if (p[i] < static_cast <str::U8string::value_type>(0x80 ))
168+ {
169+ result += static_cast <str::W1252string::value_type>(p[i]); // ASCII
170+ continue ;
171+ }
172+
173+ constexpr auto invalid = static_cast <str::W1252string::value_type>(0x7F ); // <DEL>
174+ if (!(i + i < sz))
175+ {
176+ // No remaining bytes, invalid UTF-8 encoding
177+ result += invalid;
178+ return ;
179+ }
180+
181+ // https://en.wikipedia.org/wiki/UTF-8
182+ const auto b1 = static_cast <uint8_t >(p[i]);
183+ i++; // move to second byte
184+ if (b1 >= 0xE0 ) // 1110xxxx
185+ {
186+ // not a two-byte sequence, nothing to convert to Windows-1252
187+ result += invalid; // <DEL>
188+
189+ i++; // skip third byte
190+ if (b1 >= 0xF0 ) // 1111xxx
191+ {
192+ i++; // skip fourth byte
193+ }
194+ continue ;
195+ }
196+
197+ const auto b2 = static_cast <uint8_t >(p[i]);
198+ if (b2 < 0x80 ) // 10xxxxxx
199+ {
200+ // invalid second byte
201+ result += invalid; // <DEL>
202+ continue ;
203+ }
204+
205+ const str::U8string utf8{cast (b1), cast (b2)};
206+
207+ static const auto map = kv_to_vk (Windows1252_to_u8string ());
208+ const auto it = map.find (utf8);
209+ if (it != map.end ())
210+ {
211+ result += static_cast <str::W1252string::value_type>(it->second );
212+ }
213+ else
214+ {
215+ // UTF-8 character can't be converted to Windows-1252
216+ result += invalid; // <DEL>
217+ }
218+ }
219+ }
220+
221+
138222struct back_inserter final
139223{
140224 sys::U8string* container = nullptr ; // pointer instead of reference for copy
@@ -173,17 +257,6 @@ void str::utf32to8(std::u32string::const_pointer p, size_t sz, sys::U8string& re
173257 utf8::utf32to8 (p, p + sz, back_inserter (result));
174258}
175259
// Passes the sz code units starting at p to utf8::utf8to16() and appends what
// it produces to result (std::back_inserter; result is not cleared first).
// The pointer is viewed as const uint8_t* via the project's cast<> helper.
// NOTE(review): presumably utf8::utf8to16() is the UTF-8 -> UTF-16 converter of
// the bundled utf8 library -- confirm.
// NOTE(review): these are removed ("-") lines in this diff view -- confirm
// where the live definition resides before relying on this copy.
176- void str::utf8to16 (sys::U8string::const_pointer p, size_t sz, std::u16string& result)
177- {
178- auto p8 = cast<const uint8_t *>(p);
179- utf8::utf8to16 (p8, p8 + sz, std::back_inserter (result));
180- }
// Passes the sz code units starting at p to utf8::utf8to32() and appends what
// it produces to result (std::back_inserter; result is not cleared first).
// The pointer is viewed as const uint8_t* via the project's cast<> helper.
// NOTE(review): presumably utf8::utf8to32() is the UTF-8 -> UTF-32 converter of
// the bundled utf8 library -- confirm.
// NOTE(review): these are removed ("-") lines in this diff view -- confirm
// where the live definition resides before relying on this copy.
181- void str::utf8to32 (sys::U8string::const_pointer p, size_t sz, std::u32string& result)
182- {
183- auto p8 = cast<const uint8_t *>(p);
184- utf8::utf8to32 (p8, p8 + sz, std::back_inserter (result));
185- }
186-
187260inline void wsto8_ (std::u16string::const_pointer begin, std::u16string::const_pointer end, sys::U8string& result)
188261{
189262 utf8::utf16to8 (begin, end, back_inserter (result));
@@ -231,12 +304,9 @@ std::string str::toString(const str::U8string& utf8)
231304 auto platform = details::Platform; // "conditional expression is constant"
232305 if (platform == details::PlatformType::Windows)
233306 {
234- #ifdef _WIN32
235- std::u16string utf16;
236- utf8to16 (utf8, utf16);
237- const _bstr_t s (c_str<std::wstring::const_pointer>(utf16)); // wchar_t is UTF-16 on Windows
238- return static_cast <std::string::const_pointer>(s);
239- #endif
307+ str::W1252string w1252;
308+ toWindows1252 (utf8.c_str (), utf8.length (), w1252);
309+ return c_str<std::string::const_pointer>(w1252); // copy
240310 }
241311 else if (platform == details::PlatformType::Linux)
242312 {
0 commit comments