1
/* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2
/* cairo - a vector graphics library with display and print output
3
 *
4
 * The code in this file is derived from GLib's gutf8.c and
5
 *   ultimately from libunicode. It is relicensed under the
6
 *   dual LGPL/MPL with permission of the original authors.
7
 *
8
 * Copyright © 1999 Tom Tromey
9
 * Copyright © 2005 Red Hat, Inc
10
 *
11
 * This library is free software; you can redistribute it and/or
12
 * modify it either under the terms of the GNU Lesser General Public
13
 * License version 2.1 as published by the Free Software Foundation
14
 * (the "LGPL") or, at your option, under the terms of the Mozilla
15
 * Public License Version 1.1 (the "MPL"). If you do not alter this
16
 * notice, a recipient may use your version of this file under either
17
 * the MPL or the LGPL.
18
 *
19
 * You should have received a copy of the LGPL along with this library
20
 * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22
 * You should have received a copy of the MPL along with this library
23
 * in the file COPYING-MPL-1.1
24
 *
25
 * The contents of this file are subject to the Mozilla Public License
26
 * Version 1.1 (the "License"); you may not use this file except in
27
 * compliance with the License. You may obtain a copy of the License at
28
 * http://www.mozilla.org/MPL/
29
 *
30
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31
 * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32
 * the specific language governing rights and limitations.
33
 *
34
 * The Original Code is the cairo graphics library.
35
 *
36
 * The Initial Developer of the Original Code is Tom Tromey.
37
 *  and Red Hat, Inc.
38
 *
39
 * Contributor(s):
40
 *	Owen Taylor <otaylor@redhat.com>
41
 */
42

            
43
#include "cairoint.h"
44
#include "cairo-error-private.h"
45

            
46
/* Classifies @Char, the first byte of a UTF-8 sequence.
 *
 * On return @Len holds the total length of the sequence in bytes (1-6,
 * the obsolete 5- and 6-byte forms included) and @Mask the bits of the
 * lead byte that carry payload.  When the byte matches none of the
 * lead-byte patterns, @Len is set to -1 and @Mask to 0.
 *
 * @Char is evaluated several times and must be free of side effects.
 * The do { } while (0) wrapper makes the expansion a single statement,
 * so the macro can be used safely in an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)					      \
  do									      \
    {									      \
      if ((Char) < 128)							      \
	{								      \
	  (Len) = 1;							      \
	  (Mask) = 0x7f;						      \
	}								      \
      else if (((Char) & 0xe0) == 0xc0)					      \
	{								      \
	  (Len) = 2;							      \
	  (Mask) = 0x1f;						      \
	}								      \
      else if (((Char) & 0xf0) == 0xe0)					      \
	{								      \
	  (Len) = 3;							      \
	  (Mask) = 0x0f;						      \
	}								      \
      else if (((Char) & 0xf8) == 0xf0)					      \
	{								      \
	  (Len) = 4;							      \
	  (Mask) = 0x07;						      \
	}								      \
      else if (((Char) & 0xfc) == 0xf8)					      \
	{								      \
	  (Len) = 5;							      \
	  (Mask) = 0x03;						      \
	}								      \
      else if (((Char) & 0xfe) == 0xfc)					      \
	{								      \
	  (Len) = 6;							      \
	  (Mask) = 0x01;						      \
	}								      \
      else								      \
	{								      \
	  /* Not a lead byte: leave no stale mask behind. */		      \
	  (Len) = -1;							      \
	  (Mask) = 0;							      \
	}								      \
    }									      \
  while (0)
79

            
80
/* Number of bytes needed to encode the code point @Char in UTF-8,
 * counting the obsolete 5- and 6-byte forms: one byte, plus one more
 * for each encoding-length threshold that @Char reaches.
 * @Char is evaluated more than once.
 */
#define UTF8_LENGTH(Char)              \
  (1 +                                 \
   ((Char) >= 0x80) +                  \
   ((Char) >= 0x800) +                 \
   ((Char) >= 0x10000) +               \
   ((Char) >= 0x200000) +              \
   ((Char) >= 0x4000000))
86

            
87
/* Assembles a code point from the @Len-byte UTF-8 sequence at @Chars.
 *
 * @Mask selects the payload bits of the lead byte (see UTF8_COMPUTE);
 * each following byte must have the form 10xxxxxx and contributes six
 * bits.  @Count is a scratch loop index.  If a following byte is not a
 * continuation byte, @Result is set to -1 and @Count is left at the
 * index of the offending byte.
 *
 * The arguments are evaluated several times.  The do { } while (0)
 * wrapper makes the expansion a single statement, so that both the
 * initial assignment and the loop are guarded when the macro is used
 * in an unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
  do									      \
    {									      \
      (Result) = (Chars)[0] & (Mask);					      \
      for ((Count) = 1; (Count) < (Len); ++(Count))			      \
	{								      \
	  if (((Chars)[(Count)] & 0xc0) != 0x80)			      \
	    {								      \
	      (Result) = -1;						      \
	      break;							      \
	    }								      \
	  (Result) <<= 6;						      \
	  (Result) |= ((Chars)[(Count)] & 0x3f);			      \
	}								      \
    }									      \
  while (0)
99

            
100
/* True when @Char is below 0x110000 and does not lie in the surrogate
 * range 0xD800-0xDFFF.  @Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                   \
    (!((Char) >= 0x110000 ||                  \
       ((Char) & 0xFFFFF800) == 0xD800))
103

            
104
/* Number of bytes to advance, indexed by the first byte of a UTF-8
 * sequence.  Bytes that cannot start a sequence (continuation bytes
 * 0x80-0xbf, and 0xfe/0xff) map to 1 so that stepping always makes
 * progress.
 */
static const char utf8_skip_data[256] = {
    /* 0x00-0x7f: single-byte (ASCII) */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80-0xbf: continuation bytes */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0-0xdf: two-byte lead */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0-0xef: three-byte lead */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6, 0xfe-0xff: 1 */
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Advances @p past the UTF-8 sequence it points at, using only the
 * lead byte; the result has the same type as @p.  The byte is read
 * through a const-qualified pointer so that no qualifier is cast away.
 * @p is evaluated twice.
 */
#define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(const unsigned char *)(p)])
116

            
117
/* Decodes the UTF-8 sequence starting at @p into a code point.
 *
 * No length limit is applied and neither overlong encodings nor the
 * Unicode range are checked; the callers visible in this file only use
 * it on input that _utf8_get_char_extended() has already accepted.
 * (uint32_t)-1 is returned when the first byte cannot start a sequence
 * or a following byte is not a continuation byte.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned char lead = *p;
    uint32_t codepoint;
    int lead_mask = 0;
    int seq_len;
    int idx;

    UTF8_COMPUTE (lead, lead_mask, seq_len);

    if (seq_len != -1) {
	UTF8_GET (codepoint, p, idx, lead_mask, seq_len);
    } else {
	codepoint = (uint32_t)-1;
    }

    return codepoint;
}
135

            
136
/* Validating variant of _utf8_get_char.
 *
 * Decodes the sequence at @p, reading no more than @max_len bytes; a
 * negative @max_len means there is no limit.
 *
 * Returns the decoded code point, or
 *   (uint32_t)-1 if the sequence is malformed or overlong,
 *   (uint32_t)-2 if the input ends (through @max_len or a nul byte)
 *                in the middle of a sequence that is well formed so far.
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
			 long		      max_len)
{
    uint32_t wc = p[0];
    int len, i;

    if (wc < 0x80)
	return wc;

    /* Continuation bytes and 0xfe/0xff cannot start a sequence. */
    if (wc < 0xc0 || wc >= 0xfe)
	return (uint32_t)-1;

    /* The sequence length equals the number of leading one bits of the
     * lead byte (2 to 6); the bits below the following zero bit are
     * payload. */
    len = 2;
    while (len < 6 && (wc & (0x80 >> len)))
	len++;
    wc &= 0x7f >> len;

    if (max_len >= 0 && len > max_len) {
	/* Truncated: still report garbage among the bytes we do have. */
	for (i = 1; i < max_len; i++) {
	    if ((p[i] & 0xc0) != 0x80)
		return (uint32_t)-1;
	}
	return (uint32_t)-2;
    }

    for (i = 1; i < len; i++) {
	const uint32_t ch = p[i];

	/* A nul byte here means the string stopped early; any other
	 * non-continuation byte is an encoding error. */
	if ((ch & 0xc0) != 0x80)
	    return ch ? (uint32_t)-1 : (uint32_t)-2;

	wc = (wc << 6) | (ch & 0x3f);
    }

    /* Reject overlong encodings. */
    return UTF8_LENGTH(wc) == len ? wc : (uint32_t)-1;
}
196

            
197
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the first character of a valid UTF-8 string, and returns
 * the number of bytes consumed.
 *
 * Note that the string should be valid.  Do not use this without
 * validating the string first: overlong encodings and out-of-range
 * values are not detected here.
 *
 * As a safety net for malformed input, (uint32_t)-1 is stored in
 * @unicode when the first byte cannot start a sequence or when a
 * following byte is not a continuation byte. In the latter case only
 * the bytes preceding the offending byte are counted as consumed, so
 * that a caller walking a nul-terminated string never steps over the
 * terminator of a truncated sequence.
 *
 * Returns: the number of bytes forming the character returned; always
 * at least 1.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
				uint32_t   *unicode)
{
    const unsigned char *bytes = (const unsigned char *) p;
    const unsigned char lead = bytes[0];
    uint32_t result;
    int i, len;

    if (lead < 0x80) {
	len = 1;
	result = lead;
    } else if ((lead & 0xe0) == 0xc0) {
	len = 2;
	result = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
	len = 3;
	result = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
	len = 4;
	result = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
	len = 5;
	result = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
	len = 6;
	result = lead & 0x01;
    } else {
	/* Stray continuation byte, 0xfe or 0xff: skip just this byte. */
	if (unicode)
	    *unicode = (uint32_t)-1;
	return 1;
    }

    for (i = 1; i < len; i++) {
	if ((bytes[i] & 0xc0) != 0x80) {
	    /* Stop at the offending byte (possibly the terminating
	     * nul) instead of skipping the full declared length. */
	    if (unicode)
		*unicode = (uint32_t)-1;
	    return i;
	}
	result = (result << 6) | (bytes[i] & 0x3f);
    }

    if (unicode)
	*unicode = result;
    return len;
}
230

            
231
/**
232
 * _cairo_utf8_to_ucs4:
233
 * @str: an UTF-8 string
234
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
235
 *   If @len is supplied and the string has an embedded nul
236
 *   byte, only the portion before the nul byte is converted.
237
 * @result: location to store a pointer to a newly allocated UTF-32
238
 *   string (always native endian), or %NULL. Free with free(). A 0
239
 *   word will be written after the last character.
240
 * @items_written: location to store number of 32-bit words
241
 *   written. (Not including the trailing 0)
242
 *
243
 * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
244
 * with 1 32-bit word per character. The string is validated to
245
 * consist entirely of valid Unicode characters.
246
 *
247
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
248
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
249
 *   invalid sequence was found.
250
 **/
251
cairo_status_t
252
71216
_cairo_utf8_to_ucs4 (const char *str,
253
		     int	 len,
254
		     uint32_t  **result,
255
		     int	*items_written)
256
{
257
71216
    uint32_t *str32 = NULL;
258
    int n_chars, i;
259
    const unsigned char *in;
260
71216
    const unsigned char * const ustr = (const unsigned char *) str;
261

            
262
71216
    in = ustr;
263
71216
    n_chars = 0;
264
463703
    while ((len < 0 || ustr + len - in > 0) && *in)
265
    {
266
392488
	uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
267
392488
	if (wc & 0x80000000 || !UNICODE_VALID (wc))
268
1
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
269

            
270
392487
	n_chars++;
271
392487
	if (n_chars == INT_MAX)
272
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);
273

            
274
392487
	in = UTF8_NEXT_CHAR (in);
275
    }
276

            
277
71215
    if (result) {
278
	str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
279
	if (!str32)
280
	    return _cairo_error (CAIRO_STATUS_NO_MEMORY);
281

            
282
	in = ustr;
283
	for (i=0; i < n_chars; i++) {
284
	    str32[i] = _utf8_get_char (in);
285
	    in = UTF8_NEXT_CHAR (in);
286
	}
287
	str32[i] = 0;
288

            
289
	*result = str32;
290
    }
291

            
292
71215
    if (items_written)
293
67995
	*items_written = n_chars;
294

            
295
71215
    return CAIRO_STATUS_SUCCESS;
296
}
297

            
298
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 * space available. Or %NULL.
 *
 * Encodes @unicode as a UTF-8 byte sequence of 1 to 4 bytes. No
 * terminating nul byte is written. When @utf8 is %NULL only the
 * length of the encoding is computed. Every value below 0x200000 is
 * encoded; surrogate code points are not rejected here.
 *
 * Return value: Number of bytes in the utf8 string or 0 if @unicode
 * is 0x200000 or greater.
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
		     char     *utf8)
{
    int bytes, i;

    if (unicode >= 0x200000)
	return 0;

    if (unicode < 0x80)
	bytes = 1;
    else if (unicode < 0x800)
	bytes = 2;
    else if (unicode < 0x10000)
	bytes = 3;
    else
	bytes = 4;

    if (!utf8)
	return bytes;

    if (bytes == 1) {
	utf8[0] = unicode;
	return 1;
    }

    /* Trailing bytes, last one first: 10xxxxxx with six payload bits
     * each. */
    for (i = bytes - 1; i > 0; i--) {
	utf8[i] = 0x80 | (unicode & 0x3f);
	unicode >>= 6;
    }

    /* Lead byte: length marker 0xc0, 0xe0 or 0xf0 plus what is left of
     * the code point. */
    utf8[0] = ((0xf0 << (4 - bytes)) & 0xff) | unicode;

    return bytes;
}
342

            
343
/**
 * _cairo_ucs4_to_utf16:
 * @unicode: a UCS-4 character
 * @utf16: buffer to write utf16 string into. Must have at least 2
 * elements. Or %NULL.
 *
 * Encodes @unicode as UTF-16: values below 0x10000 are stored as a
 * single element, values up to 0x10FFFF as a high/low surrogate pair.
 * No terminating 0 is written. When @utf16 is %NULL only the number
 * of elements is computed.
 *
 * Return value: Number of elements in the utf16 string or 0 if
 * @unicode is 0x110000 or greater.
 **/
int
_cairo_ucs4_to_utf16 (uint32_t  unicode,
		      uint16_t *utf16)
{
    uint32_t offset;

    if (unicode >= 0x110000)
	return 0;

    if (unicode < 0x10000) {
	if (utf16)
	    utf16[0] = unicode;
	return 1;
    }

    /* Supplementary plane: split the 20-bit offset into two halves of
     * ten bits each. */
    if (utf16) {
	offset = unicode - 0x10000;
	utf16[0] = 0xd800 + (offset >> 10);
	utf16[1] = 0xdc00 + (offset & 0x3ff);
    }
    return 2;
}
372

            
373
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian), or %NULL. Free with free(). A 0
 *   word will be written after the last character.
 * @items_written: location to store number of 16-bit words
 *   written (not including the trailing 0), or %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * The conversion runs in two passes: the first validates the input and
 * counts the 16-bit words needed, the second (skipped when @result is
 * %NULL, as in _cairo_utf8_to_ucs4()) encodes them into the newly
 * allocated array. On failure neither @result nor @items_written is
 * written to.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found or the output would be too long.
 *   %CAIRO_STATUS_NO_MEMORY if the allocation failed.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
		      int	  len,
		      uint16_t **result,
		      int	*items_written)
{
    uint16_t *str16 = NULL;
    int n16, i;
    const unsigned char *in;
    const unsigned char * const ustr = (const unsigned char *) str;

    /* Pass 1: validate and count.  Bounds are tracked as offsets from
     * ustr; forming "ustr + len" with len == -1 would create a pointer
     * before the start of the string, which is undefined behaviour. */
    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
	/* Bytes still available, or -1 for "no limit". */
	long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
	uint32_t wc = _utf8_get_char_extended (in, remaining);

	/* The top bit is set only on the (uint32_t)-1 / -2 error codes. */
	if (wc & 0x80000000 || !UNICODE_VALID (wc))
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	/* Characters beyond the BMP need a surrogate pair. */
	if (wc < 0x10000)
	    n16 += 1;
	else
	    n16 += 2;

	/* Stop while another "+= 2" and the final "+ 1" still fit. */
	if (n16 == INT_MAX - 1 || n16 == INT_MAX)
	    return _cairo_error (CAIRO_STATUS_INVALID_STRING);

	in = UTF8_NEXT_CHAR (in);
    }

    /* Pass 2: encode the now-validated characters. */
    if (result) {
	str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
	if (!str16)
	    return _cairo_error (CAIRO_STATUS_NO_MEMORY);

	in = ustr;
	for (i = 0; i < n16;) {
	    uint32_t wc = _utf8_get_char (in);

	    i += _cairo_ucs4_to_utf16 (wc, str16 + i);

	    in = UTF8_NEXT_CHAR (in);
	}

	str16[i] = 0;

	*result = str16;
    }

    if (items_written)
	*items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif