/*
 * path: root/jni/EastAsianWidth/unicode/ucasemap.h
 * blob: 9f5880cd6a721a4a1a57a1eed23d5e88112633be
 */
/*
*******************************************************************************
*
*   Copyright (C) 2005-2007, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucasemap.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2005may06
*   created by: Markus W. Scherer
*
*   Case mapping service object and functions using it.
*/

#ifndef __UCASEMAP_H__
#define __UCASEMAP_H__

#include "unicode/utypes.h"
#include "unicode/ustring.h"

/**
 * \file
 * \brief C API: Unicode case mapping functions using a UCaseMap service object.
 *
 * The service object takes care of memory allocations, data loading, and setup
 * for the attributes, as usual.
 *
 * Currently, the functionality provided here does not overlap with uchar.h
 * and ustring.h, except for ucasemap_toTitle().
 *
 * ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
 */

/**
 * Opaque handle type for the service object used by the newer ICU case
 * mapping functions. Only the name is declared here; the structure itself is
 * not defined in this header, so client code works with UCaseMap pointers.
 * Older case mapping functions did not use a service object.
 * The typedef also introduces the tag "struct UCaseMap".
 * @stable ICU 3.4
 */
typedef struct UCaseMap UCaseMap;

/**
 * Open a UCaseMap service object for a locale and a set of options.
 * The locale ID and options are preprocessed so that functions using the
 * service object need not process them in each call.
 * Close the returned object with ucasemap_close().
 *
 * @param locale ICU locale ID, used for language-dependent
 *               upper-/lower-/title-casing according to the Unicode standard.
 *               Usual semantics: ""=root, NULL=default locale, etc.
 * @param options Options bit set, used for case folding and string comparisons.
 *                Same flags as for u_foldCase(), u_strFoldCase(),
 *                u_strCaseCompare(), etc.
 *                The titlecasing option bits (U_TITLECASE_NO_LOWERCASE,
 *                U_TITLECASE_NO_BREAK_ADJUSTMENT) belong to the same bit set.
 *                Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 * @return Pointer to a UCaseMap service object, if successful.
 *
 * @see ucasemap_close
 * @see U_FOLD_CASE_DEFAULT
 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
 * @see U_TITLECASE_NO_LOWERCASE
 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
 * @stable ICU 3.4
 */
U_STABLE UCaseMap * U_EXPORT2
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);

/**
 * Close a UCaseMap service object.
 * A break iterator that was adopted with ucasemap_setBreakIterator()
 * is released as part of closing the service object.
 * @param csm Object to be closed.
 * @see ucasemap_open
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
ucasemap_close(UCaseMap *csm);

/**
 * Get the locale ID that is used for language-dependent case mappings.
 * The locale is established by ucasemap_open() and can be changed with
 * ucasemap_setLocale().
 * @param csm UCaseMap service object.
 * @return locale ID; the string is const and must not be modified.
 * @see ucasemap_setLocale
 * @stable ICU 3.4
 */
U_STABLE const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm);

/**
 * Get the options bit set that is used for case folding and string comparisons.
 * The options are established by ucasemap_open() and can be changed with
 * ucasemap_setOptions().
 * @param csm UCaseMap service object.
 * @return options bit set
 * @see ucasemap_setOptions
 * @stable ICU 3.4
 */
U_STABLE uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm);

/**
 * Set the locale ID that is used for language-dependent case mappings.
 * Subsequent calls to ucasemap_getLocale() report the locale set here.
 *
 * @param csm UCaseMap service object.
 * @param locale Locale ID, see ucasemap_open().
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see ucasemap_open
 * @see ucasemap_getLocale
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);

/**
 * Set the options bit set that is used for case folding and string comparisons.
 * The titlecasing option bits (U_TITLECASE_NO_LOWERCASE,
 * U_TITLECASE_NO_BREAK_ADJUSTMENT) are set through the same bit set.
 *
 * @param csm UCaseMap service object.
 * @param options Options bit set, see ucasemap_open().
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see ucasemap_open
 * @see ucasemap_getOptions
 * @see U_TITLECASE_NO_LOWERCASE
 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);

#ifndef U_HIDE_DRAFT_API

/**
 * Do not lowercase non-initial parts of words when titlecasing.
 * Option bit (0x100) for titlecasing APIs that take an options bit set;
 * it is meant to be combined with other option bits in that set.
 *
 * By default, titlecasing will titlecase the first cased character
 * of a word and lowercase all other characters.
 * With this option, the other characters will not be modified.
 *
 * @see ucasemap_setOptions
 * @see ucasemap_toTitle
 * @see ucasemap_utf8ToTitle
 * @see UnicodeString::toTitle
 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
 * @draft ICU 3.8
 */
#define U_TITLECASE_NO_LOWERCASE 0x100

/**
 * Do not adjust the titlecasing indexes from BreakIterator::next() indexes;
 * titlecase exactly the characters at breaks from the iterator.
 * Option bit (0x200) for titlecasing APIs that take an options bit set;
 * it is meant to be combined with other option bits in that set.
 *
 * By default, titlecasing will take each break iterator index,
 * adjust it by looking for the next cased character, and titlecase that one.
 * Other characters are lowercased.
 *
 * That default behavior (without this option) follows
 * Unicode 4 & 5 section 3.13 Default Case Operations:
 *
 * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
 * cased character F. If F exists, map F to default_title(F); then map each
 * subsequent character C to default_lower(C).
 *
 * @see ucasemap_setOptions
 * @see ucasemap_toTitle
 * @see ucasemap_utf8ToTitle
 * @see UnicodeString::toTitle
 * @see U_TITLECASE_NO_LOWERCASE
 * @draft ICU 3.8
 */
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200

#endif

#if !UCONFIG_NO_BREAK_ITERATION

/**
 * Get the break iterator that is used for titlecasing.
 * Do not modify the returned break iterator.
 * An iterator that was adopted with ucasemap_setBreakIterator() remains
 * owned by the UCaseMap service object.
 * This function is declared only if UCONFIG_NO_BREAK_ITERATION is not set.
 * @param csm UCaseMap service object.
 * @return titlecasing break iterator
 * @see ucasemap_setBreakIterator
 * @draft ICU 3.8
 */
U_DRAFT const UBreakIterator * U_EXPORT2
ucasemap_getBreakIterator(const UCaseMap *csm);

/**
 * Set the break iterator that is used for titlecasing.
 * The UCaseMap service object releases a previously set break iterator
 * and "adopts" this new one, taking ownership of it.
 * It will be released in a subsequent call to ucasemap_setBreakIterator()
 * or ucasemap_close(). Because ownership is transferred, the caller
 * must not release iterToAdopt itself after this call.
 *
 * Break iterator operations are not thread-safe. Therefore, titlecasing
 * functions use non-const UCaseMap objects. It is not possible to titlecase
 * strings concurrently using the same UCaseMap.
 *
 * This function is declared only if UCONFIG_NO_BREAK_ITERATION is not set.
 *
 * @param csm UCaseMap service object.
 * @param iterToAdopt Break iterator to be adopted for titlecasing.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @see ucasemap_getBreakIterator
 * @see ucasemap_toTitle
 * @see ucasemap_utf8ToTitle
 * @draft ICU 3.8
 */
U_DRAFT void U_EXPORT2
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);

/**
 * Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
 * except that it takes ucasemap_setOptions() into account and has performance
 * advantages from being able to use a UCaseMap object for multiple case mapping
 * operations, saving setup time.
 *
 * Casing is locale-dependent and context-sensitive.
 * Titlecasing uses a break iterator to find the first characters of words
 * that are to be titlecased. It titlecases those characters and lowercases
 * all others. (This can be modified with ucasemap_setOptions().)
 *
 * The titlecase break iterator can be provided (see ucasemap_setBreakIterator())
 * to customize for arbitrary styles, using rules and dictionaries beyond
 * the standard iterators.
 * It may be more efficient to always provide an iterator to avoid
 * opening and closing one for each string.
 * The standard titlecase iterator for the root locale implements the
 * algorithm of Unicode TR 21.
 *
 * This function uses only the setText(), first() and next() methods of the
 * provided break iterator.
 * Because break iterator operations are not thread-safe, csm is non-const and
 * the same UCaseMap must not be used for titlecasing concurrently.
 *
 * The result may be longer or shorter than the original.
 * The source string and the destination buffer must not overlap.
 *
 * @param csm       UCaseMap service object.
 * @param dest      A buffer for the result string. The result will be NUL-terminated if
 *                  the buffer is large enough.
 *                  The contents are undefined in case of failure.
 * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
 *                  dest may be NULL and the function will only return the length of the result
 *                  without writing any of the result string.
 * @param src       The original string.
 * @param srcLength The length of the original string (number of UChars).
 *                  If -1, then src must be NUL-terminated.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                  which must not indicate a failure before the function call.
 * @return The length of the result string, if successful - or in case of a buffer overflow,
 *         in which case it will be greater than destCapacity.
 *
 * @see u_strToTitle
 * @see ucasemap_setBreakIterator
 * @see U_TITLECASE_NO_LOWERCASE
 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
 * @draft ICU 3.8
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
                 UChar *dest, int32_t destCapacity,
                 const UChar *src, int32_t srcLength,
                 UErrorCode *pErrorCode);

#endif

/**
 * Lowercase the characters in a UTF-8 string.
 * Casing is locale-dependent and context-sensitive.
 * The result may be longer or shorter than the original.
 * The source string and the destination buffer must not overlap.
 *
 * @param csm       UCaseMap service object.
 * @param dest      A buffer for the result string. The result will be NUL-terminated if
 *                  the buffer is large enough.
 *                  The contents are undefined in case of failure.
 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
 *                  dest may be NULL and the function will only return the length of the result
 *                  without writing any of the result string.
 * @param src       The original string.
 * @param srcLength The length of the original string (number of bytes).
 *                  If -1, then src must be NUL-terminated.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                  which must not indicate a failure before the function call.
 * @return The length of the result string, if successful - or in case of a buffer overflow,
 *         in which case it will be greater than destCapacity.
 *
 * @see u_strToLower
 * @stable ICU 3.4
 */
U_STABLE int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode);

/**
 * Uppercase the characters in a UTF-8 string.
 * Casing is locale-dependent and context-sensitive.
 * The result may be longer or shorter than the original.
 * The source string and the destination buffer must not overlap.
 *
 * @param csm       UCaseMap service object.
 * @param dest      A buffer for the result string. The result will be NUL-terminated if
 *                  the buffer is large enough.
 *                  The contents are undefined in case of failure.
 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
 *                  dest may be NULL and the function will only return the length of the result
 *                  without writing any of the result string.
 * @param src       The original string.
 * @param srcLength The length of the original string (number of bytes).
 *                  If -1, then src must be NUL-terminated.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                  which must not indicate a failure before the function call.
 * @return The length of the result string, if successful - or in case of a buffer overflow,
 *         in which case it will be greater than destCapacity.
 *
 * @see u_strToUpper
 * @stable ICU 3.4
 */
U_STABLE int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode);

#if !UCONFIG_NO_BREAK_ITERATION

/**
 * Titlecase a UTF-8 string.
 * Casing is locale-dependent and context-sensitive.
 * Titlecasing uses a break iterator to find the first characters of words
 * that are to be titlecased. It titlecases those characters and lowercases
 * all others. (This can be modified with ucasemap_setOptions().)
 *
 * The titlecase break iterator can be provided (see ucasemap_setBreakIterator())
 * to customize for arbitrary styles, using rules and dictionaries beyond
 * the standard iterators.
 * It may be more efficient to always provide an iterator to avoid
 * opening and closing one for each string.
 * The standard titlecase iterator for the root locale implements the
 * algorithm of Unicode TR 21.
 *
 * This function uses only the setText(), first() and next() methods of the
 * provided break iterator.
 * Because break iterator operations are not thread-safe, csm is non-const and
 * the same UCaseMap must not be used for titlecasing concurrently.
 *
 * The result may be longer or shorter than the original.
 * The source string and the destination buffer must not overlap.
 *
 * @param csm       UCaseMap service object.
 * @param dest      A buffer for the result string. The result will be NUL-terminated if
 *                  the buffer is large enough.
 *                  The contents are undefined in case of failure.
 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
 *                  dest may be NULL and the function will only return the length of the result
 *                  without writing any of the result string.
 * @param src       The original string.
 * @param srcLength The length of the original string (number of bytes).
 *                  If -1, then src must be NUL-terminated.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                  which must not indicate a failure before the function call.
 * @return The length of the result string, if successful - or in case of a buffer overflow,
 *         in which case it will be greater than destCapacity.
 *
 * @see u_strToTitle
 * @see ucasemap_setBreakIterator
 * @see U_TITLECASE_NO_LOWERCASE
 * @see U_TITLECASE_NO_BREAK_ADJUSTMENT
 * @draft ICU 3.8
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
                    char *dest, int32_t destCapacity,
                    const char *src, int32_t srcLength,
                    UErrorCode *pErrorCode);

#endif

/**
 * Case-fold the characters in a UTF-8 string.
 * Case-folding is locale-independent and not context-sensitive,
 * but there is an option for whether to include or exclude mappings for dotted I
 * and dotless i that are marked with 'I' in CaseFolding.txt.
 * That option is part of the options bit set of the service object
 * (see ucasemap_setOptions() and U_FOLD_CASE_EXCLUDE_SPECIAL_I).
 * The result may be longer or shorter than the original.
 * The source string and the destination buffer must not overlap.
 *
 * @param csm       UCaseMap service object.
 * @param dest      A buffer for the result string. The result will be NUL-terminated if
 *                  the buffer is large enough.
 *                  The contents are undefined in case of failure.
 * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
 *                  dest may be NULL and the function will only return the length of the result
 *                  without writing any of the result string.
 * @param src       The original string.
 * @param srcLength The length of the original string (number of bytes).
 *                  If -1, then src must be NUL-terminated.
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                  which must not indicate a failure before the function call.
 * @return The length of the result string, if successful - or in case of a buffer overflow,
 *         in which case it will be greater than destCapacity.
 *
 * @see u_strFoldCase
 * @see ucasemap_setOptions
 * @see U_FOLD_CASE_DEFAULT
 * @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
 * @draft ICU 3.8
 */
U_DRAFT int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
                      char *dest, int32_t destCapacity,
                      const char *src, int32_t srcLength,
                      UErrorCode *pErrorCode);

#endif