FreeRDP
Loading...
Searching...
No Matches
unicode_builtin.c
1/*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
23/* ---------------------------------------------------------------------
24
25Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26Author: Mark E. Davis, 1994.
27Rev History: Rick McGowan, fixes & updates May 2001.
28Sept 2001: fixed const & error conditions per
29mods suggested by S. Parent & A. Lillich.
30June 2002: Tim Dodd added detection and handling of incomplete
31source sequences, enhanced error detection, added casts
32to eliminate compiler warnings.
33July 2003: slight mods to back out aggressive FFFE detection.
34Jan 2004: updated switches in from-UTF8 conversions.
35Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37See the header file "utf.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41#include <winpr/wtypes.h>
42#include <winpr/string.h>
43#include <winpr/assert.h>
44#include <winpr/cast.h>
45
46#include "unicode.h"
47
48#include "../log.h"
49#define TAG WINPR_TAG("unicode")
50
51/*
52 * Character Types:
53 *
54 * UTF8: uint8_t 8 bits
55 * UTF16: uint16_t 16 bits
56 * UTF32: uint32_t 32 bits
57 */
58
59/* Some fundamental constants */
/*
 * Each expansion is fully parenthesized so the macro behaves as a single
 * primary expression wherever it is used (e.g. next to sizeof or a postfix
 * operator), instead of exposing a bare cast expression.
 */
#define UNI_REPLACEMENT_CHAR ((uint32_t)0x0000FFFD) /* U+FFFD REPLACEMENT CHARACTER */
#define UNI_MAX_BMP ((uint32_t)0x0000FFFF)          /* last code point of the BMP */
#define UNI_MAX_UTF16 ((uint32_t)0x0010FFFF)        /* last code point encodable in UTF-16 */
#define UNI_MAX_UTF32 ((uint32_t)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((uint32_t)0x0010FFFF)
65
/* Outcome reported by the buffer conversion routines in this file. */
typedef enum
{
	conversionOK = 0,    /* the whole source range was converted */
	sourceExhausted = 1, /* the source ends in the middle of a character */
	targetExhausted = 2, /* the target buffer is too small for the result */
	sourceIllegal = 3    /* the source contains an illegal/malformed sequence */
} ConversionResult;
73
/* Selects how the converters react to illegal input values. */
typedef enum
{
	strictConversion = 0, /* stop and report sourceIllegal */
	lenientConversion = 1 /* keep going (see the individual converters) */
} ConversionFlags;
79
/* Number of payload bits carried by each half of a surrogate pair. */
static const int halfShift = 10;

/* First code point above the BMP; subtracted before splitting into surrogates. */
static const uint32_t halfBase = 0x10000u;

/* Mask selecting the low halfShift bits (payload of the low surrogate). */
static const uint32_t halfMask = 0x3FFu;
84
/*
 * UTF-16 surrogate ranges. The expansions are fully parenthesized so each
 * macro is usable as a single primary expression in any context.
 */
#define UNI_SUR_HIGH_START ((uint32_t)0xD800)
#define UNI_SUR_HIGH_END ((uint32_t)0xDBFF)
#define UNI_SUR_LOW_START ((uint32_t)0xDC00)
#define UNI_SUR_LOW_END ((uint32_t)0xDFFF)
89
90/* --------------------------------------------------------------------- */
91
92/*
93 * Index into the table below with the first byte of a UTF-8 sequence to
94 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
96 * left as-is for anyone who may want to do such conversion, which was
97 * allowed in earlier algorithms.
98 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0xBF: no trailing bytes */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 */
	/* 0xC0 - 0xDF: one trailing byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
	/* 0xE0 - 0xEF: two trailing bytes */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xE0 */
	/* 0xF0 - 0xF7: three, 0xF8 - 0xFB: four, 0xFC - 0xFF: five trailing bytes */
	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 /* 0xF0 */
};
109
110/*
111 * Magic values subtracted from a buffer value during UTF8 conversion.
112 * This table contains as many values as there might be trailing bytes
113 * in a UTF-8 sequence.
114 */
static const uint32_t offsetsFromUTF8[6] = {
	0x00000000UL, /* 0 trailing bytes */
	0x00003080UL, /* 1 trailing byte */
	0x000E2080UL, /* 2 trailing bytes */
	0x03C82080UL, /* 3 trailing bytes */
	0xFA082080UL, /* 4 trailing bytes */
	0x82082080UL  /* 5 trailing bytes */
};
117
118/*
119 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
120 * into the first byte, depending on how many bytes follow. There are
121 * as many entries in this table as there are UTF-8 sequence types.
122 * (I.e., one byte sequence, two byte... etc.). Remember that sequence
123 * for *legal* UTF-8 will be 4 or fewer bytes total.
124 */
/* Indexed by the total number of bytes in the UTF-8 sequence (index 0 is unused). */
static const uint8_t firstByteMark[7] = {
	0x00, /* 0: unused */
	0x00, /* 1 byte  */
	0xC0, /* 2 bytes */
	0xE0, /* 3 bytes */
	0xF0, /* 4 bytes */
	0xF8, /* 5 bytes */
	0xFC  /* 6 bytes */
};
126
127/* --------------------------------------------------------------------- */
128
129/* The interface converts a whole buffer to avoid function-call overhead.
130 * Constants have been gathered. Loops & conditionals have been removed as
131 * much as possible for efficiency, in favor of drop-through switches.
132 * (See "Note A" at the bottom of the file for equivalent code.)
133 * If your compiler supports it, the "isLegalUTF8" call can be turned
134 * into an inline function.
135 */
136
137/* --------------------------------------------------------------------- */
138
139static ConversionResult winpr_ConvertUTF16toUTF8_Internal(const uint16_t** sourceStart,
140 const uint16_t* sourceEnd,
141 uint8_t** targetStart, uint8_t* targetEnd,
142 ConversionFlags flags)
143{
144 bool computeLength = (!targetEnd) ? true : false;
145 const uint16_t* source = *sourceStart;
146 uint8_t* target = *targetStart;
147 ConversionResult result = conversionOK;
148
149 while (source < sourceEnd)
150 {
151 uint32_t ch = 0;
152 unsigned short bytesToWrite = 0;
153 const uint32_t byteMask = 0xBF;
154 const uint32_t byteMark = 0x80;
155 const uint16_t* oldSource =
156 source; /* In case we have to back up because of target overflow. */
157
158 ch = *source++;
159
160 /* If we have a surrogate pair, convert to UTF32 first. */
161 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
162 {
163 /* If the 16 bits following the high surrogate are in the source buffer... */
164 if (source < sourceEnd)
165 {
166 uint32_t ch2 = *source;
167
168 /* If it's a low surrogate, convert to UTF32. */
169 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
170 {
171 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + (ch2 - UNI_SUR_LOW_START) +
172 halfBase;
173 ++source;
174 }
175 else if (flags == strictConversion)
176 {
177 /* it's an unpaired high surrogate */
178 --source; /* return to the illegal value itself */
179 result = sourceIllegal;
180 break;
181 }
182 }
183 else
184 {
185 /* We don't have the 16 bits following the high surrogate. */
186 --source; /* return to the high surrogate */
187 result = sourceExhausted;
188 break;
189 }
190 }
191 else if (flags == strictConversion)
192 {
193 /* UTF-16 surrogate values are illegal in UTF-32 */
194 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
195 {
196 --source; /* return to the illegal value itself */
197 result = sourceIllegal;
198 break;
199 }
200 }
201
202 /* Figure out how many bytes the result will require */
203 if (ch < (uint32_t)0x80)
204 {
205 bytesToWrite = 1;
206 }
207 else if (ch < (uint32_t)0x800)
208 {
209 bytesToWrite = 2;
210 }
211 else if (ch < (uint32_t)0x10000)
212 {
213 bytesToWrite = 3;
214 }
215 else if (ch < (uint32_t)0x110000)
216 {
217 bytesToWrite = 4;
218 }
219 else
220 {
221 bytesToWrite = 3;
222 ch = UNI_REPLACEMENT_CHAR;
223 }
224
225 target += bytesToWrite;
226
227 if ((target > targetEnd) && (!computeLength))
228 {
229 source = oldSource; /* Back up source pointer! */
230 target -= bytesToWrite;
231 result = targetExhausted;
232 break;
233 }
234
235 if (!computeLength)
236 {
237 switch (bytesToWrite)
238 {
239 /* note: everything falls through. */
240 case 4:
241 *--target = (uint8_t)((ch | byteMark) & byteMask);
242 ch >>= 6;
243 /* fallthrough */
244 WINPR_FALLTHROUGH
245 case 3:
246 *--target = (uint8_t)((ch | byteMark) & byteMask);
247 ch >>= 6;
248 /* fallthrough */
249 WINPR_FALLTHROUGH
250
251 case 2:
252 *--target = (uint8_t)((ch | byteMark) & byteMask);
253 ch >>= 6;
254 /* fallthrough */
255 WINPR_FALLTHROUGH
256
257 case 1:
258 *--target = (uint8_t)(ch | firstByteMark[bytesToWrite]);
259 }
260 }
261 else
262 {
263 switch (bytesToWrite)
264 {
265 /* note: everything falls through. */
266 case 4:
267 --target;
268 /* fallthrough */
269 WINPR_FALLTHROUGH
270
271 case 3:
272 --target;
273 /* fallthrough */
274 WINPR_FALLTHROUGH
275
276 case 2:
277 --target;
278 /* fallthrough */
279 WINPR_FALLTHROUGH
280
281 case 1:
282 --target;
283 }
284 }
285
286 target += bytesToWrite;
287 }
288
289 *sourceStart = source;
290 *targetStart = target;
291 return result;
292}
293
294/* --------------------------------------------------------------------- */
295
/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * source: first byte of the sequence; exactly `length` bytes are inspected.
 * length: total number of bytes in the sequence. Anything outside 1..4
 *         yields false (the Unicode definition of UTF-8 goes up to 4-byte
 *         sequences); no byte is read in that case.
 *
 * Returns true only if the lead byte is legal (not 0x80..0xC1, not > 0xF4)
 * and every following byte is a continuation byte (0x80..0xBF), with the
 * second byte further restricted for the lead bytes 0xE0, 0xED, 0xF0 and
 * 0xF4.
 */
static bool isLegalUTF8(const uint8_t* source, int length)
{
	if ((length < 1) || (length > 4))
		return false;

	/* Third and fourth byte (when present) are plain continuation bytes. */
	for (int i = length - 1; i >= 2; i--)
	{
		const uint8_t c = source[i];

		if ((c < 0x80) || (c > 0xBF))
			return false;
	}

	if (length >= 2)
	{
		const uint8_t a = source[1];

		/*
		 * The second byte must be a continuation byte for every lead byte.
		 * Checking the lower bound here (and not only in the fallback case)
		 * also rejects inputs such as ED 20 80 or F4 41 80 80.
		 */
		if ((a < 0x80) || (a > 0xBF))
			return false;

		/* Lead-specific restrictions on the second byte. */
		switch (source[0])
		{
			case 0xE0:
				if (a < 0xA0)
					return false;
				break;

			case 0xED:
				if (a > 0x9F)
					return false;
				break;

			case 0xF0:
				if (a < 0x90)
					return false;
				break;

			case 0xF4:
				if (a > 0x8F)
					return false;
				break;

			default:
				break;
		}
	}

	/* 0x80..0xBF are continuation bytes, 0xC0/0xC1 are never legal leads. */
	if ((source[0] >= 0x80) && (source[0] < 0xC2))
		return false;

	if (source[0] > 0xF4)
		return false;

	return true;
}
379
380/* --------------------------------------------------------------------- */
381
382static ConversionResult winpr_ConvertUTF8toUTF16_Internal(const uint8_t** sourceStart,
383 const uint8_t* sourceEnd,
384 uint16_t** targetStart,
385 uint16_t* targetEnd,
386 ConversionFlags flags)
387{
388 bool computeLength = (!targetEnd) ? true : false;
389 ConversionResult result = conversionOK;
390 const uint8_t* source = *sourceStart;
391 uint16_t* target = *targetStart;
392
393 while (source < sourceEnd)
394 {
395 uint32_t ch = 0;
396 unsigned short extraBytesToRead =
397 WINPR_ASSERTING_INT_CAST(unsigned short, trailingBytesForUTF8[*source]);
398
399 if ((source + extraBytesToRead) >= sourceEnd)
400 {
401 result = sourceExhausted;
402 break;
403 }
404
405 /* Do this check whether lenient or strict */
406 if (!isLegalUTF8(source, extraBytesToRead + 1))
407 {
408 result = sourceIllegal;
409 break;
410 }
411
412 /*
413 * The cases all fall through. See "Note A" below.
414 */
415 switch (extraBytesToRead)
416 {
417 case 5:
418 ch += *source++;
419 ch <<= 6; /* remember, illegal UTF-8 */
420 /* fallthrough */
421 WINPR_FALLTHROUGH
422
423 case 4:
424 ch += *source++;
425 ch <<= 6; /* remember, illegal UTF-8 */
426 /* fallthrough */
427 WINPR_FALLTHROUGH
428
429 case 3:
430 ch += *source++;
431 ch <<= 6;
432 /* fallthrough */
433 WINPR_FALLTHROUGH
434
435 case 2:
436 ch += *source++;
437 ch <<= 6;
438 /* fallthrough */
439 WINPR_FALLTHROUGH
440
441 case 1:
442 ch += *source++;
443 ch <<= 6;
444 /* fallthrough */
445 WINPR_FALLTHROUGH
446
447 case 0:
448 ch += *source++;
449 }
450
451 ch -= offsetsFromUTF8[extraBytesToRead];
452
453 if ((target >= targetEnd) && (!computeLength))
454 {
455 source -= (extraBytesToRead + 1); /* Back up source pointer! */
456 result = targetExhausted;
457 break;
458 }
459
460 if (ch <= UNI_MAX_BMP)
461 {
462 /* Target is a character <= 0xFFFF */
463 /* UTF-16 surrogate values are illegal in UTF-32 */
464 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END)
465 {
466 if (flags == strictConversion)
467 {
468 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
469 result = sourceIllegal;
470 break;
471 }
472 else
473 {
474 if (!computeLength)
475 *target++ = UNI_REPLACEMENT_CHAR;
476 else
477 target++;
478 }
479 }
480 else
481 {
482 if (!computeLength)
483 *target++ = (uint16_t)ch; /* normal case */
484 else
485 target++;
486 }
487 }
488 else if (ch > UNI_MAX_UTF16)
489 {
490 if (flags == strictConversion)
491 {
492 result = sourceIllegal;
493 source -= (extraBytesToRead + 1); /* return to the start */
494 break; /* Bail out; shouldn't continue */
495 }
496 else
497 {
498 if (!computeLength)
499 *target++ = UNI_REPLACEMENT_CHAR;
500 else
501 target++;
502 }
503 }
504 else
505 {
506 /* target is a character in range 0xFFFF - 0x10FFFF. */
507 if ((target + 1 >= targetEnd) && (!computeLength))
508 {
509 source -= (extraBytesToRead + 1); /* Back up source pointer! */
510 result = targetExhausted;
511 break;
512 }
513
514 ch -= halfBase;
515
516 if (!computeLength)
517 {
518 *target++ = (uint16_t)((ch >> halfShift) + UNI_SUR_HIGH_START);
519 *target++ = (uint16_t)((ch & halfMask) + UNI_SUR_LOW_START);
520 }
521 else
522 {
523 target++;
524 target++;
525 }
526 }
527 }
528
529 *sourceStart = source;
530 *targetStart = target;
531 return result;
532}
533
538static int winpr_ConvertUTF8toUTF16(const uint8_t* src, int cchSrc, uint16_t* dst, int cchDst)
539{
540 size_t length = 0;
541 uint16_t* dstBeg = NULL;
542 uint16_t* dstEnd = NULL;
543 const uint8_t* srcBeg = NULL;
544 const uint8_t* srcEnd = NULL;
545 ConversionResult result = sourceIllegal;
546
547 if (cchSrc == -1)
548 cchSrc = (int)strnlen((const char*)src, INT32_MAX - 1) + 1;
549
550 srcBeg = src;
551 srcEnd = &src[cchSrc];
552
553 if (cchDst == 0)
554 {
555 result =
556 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
557
558 length = dstBeg - (uint16_t*)NULL;
559 }
560 else
561 {
562 dstBeg = dst;
563 dstEnd = &dst[cchDst];
564
565 result =
566 winpr_ConvertUTF8toUTF16_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
567
568 length = dstBeg - dst;
569 }
570
571 if (result == targetExhausted)
572 {
573 SetLastError(ERROR_INSUFFICIENT_BUFFER);
574 return 0;
575 }
576
577 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
578}
579
580static int winpr_ConvertUTF16toUTF8(const uint16_t* src, int cchSrc, uint8_t* dst, int cchDst)
581{
582 size_t length = 0;
583 uint8_t* dstBeg = NULL;
584 uint8_t* dstEnd = NULL;
585 const uint16_t* srcBeg = NULL;
586 const uint16_t* srcEnd = NULL;
587 ConversionResult result = sourceIllegal;
588
589 if (cchSrc == -1)
590 cchSrc = (int)_wcsnlen((const WCHAR*)src, INT32_MAX - 1) + 1;
591
592 srcBeg = src;
593 srcEnd = &src[cchSrc];
594
595 if (cchDst == 0)
596 {
597 result =
598 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
599
600 length = dstBeg - ((uint8_t*)NULL);
601 }
602 else
603 {
604 dstBeg = dst;
605 dstEnd = &dst[cchDst];
606
607 result =
608 winpr_ConvertUTF16toUTF8_Internal(&srcBeg, srcEnd, &dstBeg, dstEnd, strictConversion);
609
610 length = dstBeg - dst;
611 }
612
613 if (result == targetExhausted)
614 {
615 SetLastError(ERROR_INSUFFICIENT_BUFFER);
616 return 0;
617 }
618
619 return (result == conversionOK) ? WINPR_ASSERTING_INT_CAST(int, length) : 0;
620}
621
622/* --------------------------------------------------------------------- */
623
624int int_MultiByteToWideChar(UINT CodePage, DWORD dwFlags, LPCSTR lpMultiByteStr, int cbMultiByte,
625 LPWSTR lpWideCharStr, int cchWideChar)
626{
627 size_t cbCharLen = (size_t)cbMultiByte;
628
629 WINPR_UNUSED(dwFlags);
630
631 /* If cbMultiByte is 0, the function fails */
632 if ((cbMultiByte == 0) || (cbMultiByte < -1))
633 return 0;
634
635 if (cchWideChar < 0)
636 return -1;
637
638 if (cbMultiByte < 0)
639 {
640 const size_t len = strlen(lpMultiByteStr);
641 if (len >= INT32_MAX)
642 return 0;
643 cbCharLen = (int)len + 1;
644 }
645 else
646 cbCharLen = cbMultiByte;
647
648 WINPR_ASSERT(lpMultiByteStr);
649 switch (CodePage)
650 {
651 case CP_ACP:
652 case CP_UTF8:
653 break;
654
655 default:
656 WLog_ERR(TAG, "Unsupported encoding %u", CodePage);
657 return 0;
658 }
659
660 return winpr_ConvertUTF8toUTF16((const uint8_t*)lpMultiByteStr,
661 WINPR_ASSERTING_INT_CAST(int, cbCharLen),
662 (uint16_t*)lpWideCharStr, cchWideChar);
663}
664
665int int_WideCharToMultiByte(UINT CodePage, DWORD dwFlags, LPCWSTR lpWideCharStr, int cchWideChar,
666 LPSTR lpMultiByteStr, int cbMultiByte, LPCSTR lpDefaultChar,
667 LPBOOL lpUsedDefaultChar)
668{
669 size_t cbCharLen = (size_t)cchWideChar;
670
671 WINPR_UNUSED(dwFlags);
672 /* If cchWideChar is 0, the function fails */
673 if ((cchWideChar == 0) || (cchWideChar < -1))
674 return 0;
675
676 if (cbMultiByte < 0)
677 return -1;
678
679 WINPR_ASSERT(lpWideCharStr);
680 /* If cchWideChar is -1, the string is null-terminated */
681 if (cchWideChar == -1)
682 {
683 const size_t len = _wcslen(lpWideCharStr);
684 if (len >= INT32_MAX)
685 return 0;
686 cbCharLen = (int)len + 1;
687 }
688 else
689 cbCharLen = cchWideChar;
690
691 /*
692 * if cbMultiByte is 0, the function returns the required buffer size
693 * in bytes for lpMultiByteStr and makes no use of the output parameter itself.
694 */
695
696 return winpr_ConvertUTF16toUTF8((const uint16_t*)lpWideCharStr,
697 WINPR_ASSERTING_INT_CAST(int, cbCharLen),
698 (uint8_t*)lpMultiByteStr, cbMultiByte);
699}