00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "cvtutf.h"
00038
00039
00040
00041 static const int halfShift = 10;
00042 static const UCS4 halfBase = 0x0010000UL;
00043 static const UCS4 halfMask = 0x3FFUL;
00044 static const UCS4 kSurrogateHighStart = 0xD800UL;
00045 static const UCS4 kSurrogateHighEnd = 0xDBFFUL;
00046 static const UCS4 kSurrogateLowStart = 0xDC00UL;
00047 static const UCS4 kSurrogateLowEnd = 0xDFFFUL;
00048
00049
00050
00051 ConversionResult
00052 ConvertUCS4toUTF16(UCS4** sourceStart, const UCS4* sourceEnd,
00053 UTF16** targetStart, const UTF16* targetEnd)
00054 {
00055 ConversionResult result = ok;
00056 register UCS4* source = *sourceStart;
00057 register UTF16* target = *targetStart;
00058 while (source < sourceEnd) {
00059 register UCS4 ch;
00060 if (target >= targetEnd) {
00061 result = targetExhausted; break;
00062 };
00063 ch = *source++;
00064 if (ch <= kMaximumUCS2) {
00065 *target++ = ch;
00066 } else if (ch > kMaximumUTF16) {
00067 *target++ = kReplacementCharacter;
00068 } else {
00069 if (target + 1 >= targetEnd) {
00070 result = targetExhausted; break;
00071 };
00072 ch -= halfBase;
00073 *target++ = (ch >> halfShift) + kSurrogateHighStart;
00074 *target++ = (ch & halfMask) + kSurrogateLowStart;
00075 };
00076 };
00077 *sourceStart = source;
00078 *targetStart = target;
00079 return result;
00080 };
00081
00082
00083
00084 ConversionResult ConvertUTF16toUCS4(UTF16** sourceStart, UTF16* sourceEnd,
00085 UCS4** targetStart, const UCS4* targetEnd)
00086 {
00087 ConversionResult result = ok;
00088 register UTF16* source = *sourceStart;
00089 register UCS4* target = *targetStart;
00090 while (source < sourceEnd) {
00091 register UCS4 ch;
00092 ch = *source++;
00093 if (ch >= kSurrogateHighStart &&
00094 ch <= kSurrogateHighEnd &&
00095 source < sourceEnd) {
00096 register UCS4 ch2 = *source;
00097 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00098 ch = ((ch - kSurrogateHighStart) << halfShift)
00099 + (ch2 - kSurrogateLowStart) + halfBase;
00100 ++source;
00101 };
00102 };
00103 if (target >= targetEnd) {
00104 result = targetExhausted; break;
00105 };
00106 *target++ = ch;
00107 };
00108 *sourceStart = source;
00109 *targetStart = target;
00110 return result;
00111 };
00112
00113
00114
00115 static UCS4 offsetsFromUTF8[6] = {
00116 0x00000000UL, 0x00003080UL, 0x000E2080UL,
00117 0x03C82080UL, 0xFA082080UL, 0x82082080UL
00118 };
/*
 * Number of bytes that FOLLOW a given first byte of a UTF-8 sequence,
 * indexed by the value of that first byte.  Bytes 0x00-0xBF map to 0, so a
 * stray continuation byte (0x80-0xBF) is treated as a one-byte sequence.
 *
 * The table is never written, hence const.
 */
static const char bytesFromUTF8[256] = {
    /* 0x00 - 0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: one trailing byte */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF: two; 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
00129
00130 static UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144 int NSConvertUTF16toUTF8(unichar **sourceStart,
00145 const unichar *sourceEnd,
00146 unsigned char **targetStart,
00147 const unsigned char *targetEnd)
00148 {
00149 ConversionResult result = ok;
00150 register UTF16* source = *sourceStart;
00151 register UTF8* target = *targetStart;
00152 while (source < sourceEnd) {
00153 register UCS4 ch;
00154 register unsigned short bytesToWrite = 0;
00155 register const UCS4 byteMask = 0xBF;
00156 register const UCS4 byteMark = 0x80;
00157 ch = *source++;
00158 if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
00159 && source < sourceEnd) {
00160 register UCS4 ch2 = *source;
00161 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00162 ch = ((ch - kSurrogateHighStart) << halfShift)
00163 + (ch2 - kSurrogateLowStart) + halfBase;
00164 ++source;
00165 };
00166 };
00167 if (ch < 0x80) { bytesToWrite = 1;
00168 } else if (ch < 0x800) { bytesToWrite = 2;
00169 } else if (ch < 0x10000) { bytesToWrite = 3;
00170 } else if (ch < 0x200000) { bytesToWrite = 4;
00171 } else if (ch < 0x4000000) { bytesToWrite = 5;
00172 } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
00173 } else { bytesToWrite = 2;
00174 ch = kReplacementCharacter;
00175 };
00176
00177 target += bytesToWrite;
00178 if (target > targetEnd) {
00179 target -= bytesToWrite; result = targetExhausted; break;
00180 };
00181 switch (bytesToWrite) {
00182 case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00183 case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00184 case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00185 case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00186 case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00187 case 1: *--target = ch | firstByteMark[bytesToWrite];
00188 };
00189 target += bytesToWrite;
00190 };
00191 *sourceStart = source;
00192 *targetStart = target;
00193
00194 return result;
00195 };
00196
00197
00198
00199 int NSConvertUTF8toUTF16(unsigned char **sourceStart, unsigned char *sourceEnd,
00200 unichar **targetStart, const unichar *targetEnd)
00201 {
00202 ConversionResult result = ok;
00203 register UTF8 *source = *sourceStart;
00204 register UTF16 *target = *targetStart;
00205
00206 while (source < sourceEnd) {
00207 register UCS4 ch = 0;
00208 register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
00209
00210 if (source + extraBytesToWrite > sourceEnd) {
00211 result = sourceExhausted; break;
00212 };
00213 switch(extraBytesToWrite) {
00214 case 5: ch += *source++; ch <<= 6;
00215 case 4: ch += *source++; ch <<= 6;
00216 case 3: ch += *source++; ch <<= 6;
00217 case 2: ch += *source++; ch <<= 6;
00218 case 1: ch += *source++; ch <<= 6;
00219 case 0: ch += *source++;
00220 };
00221 ch -= offsetsFromUTF8[extraBytesToWrite];
00222
00223 if (target >= targetEnd) {
00224 result = targetExhausted; break;
00225 };
00226 if (ch <= kMaximumUCS2) {
00227 *target++ = ch;
00228 } else if (ch > kMaximumUTF16) {
00229 *target++ = kReplacementCharacter;
00230 } else {
00231 if (target + 1 >= targetEnd) {
00232 result = targetExhausted; break;
00233 };
00234 ch -= halfBase;
00235 *target++ = (ch >> halfShift) + kSurrogateHighStart;
00236 *target++ = (ch & halfMask) + kSurrogateLowStart;
00237 };
00238 };
00239 *sourceStart = source;
00240 *targetStart = target;
00241
00242 return result;
00243 };
00244
00245
00246 ConversionResult ConvertUCS4toUTF8 ( UCS4** sourceStart, const UCS4* sourceEnd,
00247 UTF8** targetStart, const UTF8* targetEnd)
00248 {
00249 ConversionResult result = ok;
00250 register UCS4* source = *sourceStart;
00251 register UTF8* target = *targetStart;
00252 while (source < sourceEnd) {
00253 register UCS4 ch;
00254 register unsigned short bytesToWrite = 0;
00255 register const UCS4 byteMask = 0xBF;
00256 register const UCS4 byteMark = 0x80;
00257 ch = *source++;
00258 if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
00259 && source < sourceEnd) {
00260 register UCS4 ch2 = *source;
00261 if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00262 ch = ((ch - kSurrogateHighStart) << halfShift)
00263 + (ch2 - kSurrogateLowStart) + halfBase;
00264 ++source;
00265 };
00266 };
00267 if (ch < 0x80) { bytesToWrite = 1;
00268 } else if (ch < 0x800) { bytesToWrite = 2;
00269 } else if (ch < 0x10000) { bytesToWrite = 3;
00270 } else if (ch < 0x200000) { bytesToWrite = 4;
00271 } else if (ch < 0x4000000) { bytesToWrite = 5;
00272 } else if (ch <= kMaximumUCS4){ bytesToWrite = 6;
00273 } else { bytesToWrite = 2;
00274 ch = kReplacementCharacter;
00275 };
00276
00277 target += bytesToWrite;
00278 if (target > targetEnd) {
00279 target -= bytesToWrite; result = targetExhausted; break;
00280 };
00281 switch (bytesToWrite) {
00282 case 6: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00283 case 5: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00284 case 4: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00285 case 3: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00286 case 2: *--target = (ch | byteMark) & byteMask; ch >>= 6;
00287 case 1: *--target = ch | firstByteMark[bytesToWrite];
00288 };
00289 target += bytesToWrite;
00290 };
00291 *sourceStart = source;
00292 *targetStart = target;
00293 return result;
00294 };
00295
00296
00297
00298 ConversionResult ConvertUTF8toUCS4 (UTF8** sourceStart, UTF8* sourceEnd,
00299 UCS4** targetStart, const UCS4* targetEnd)
00300 {
00301 ConversionResult result = ok;
00302 register UTF8* source = *sourceStart;
00303 register UCS4* target = *targetStart;
00304 while (source < sourceEnd) {
00305 register UCS4 ch = 0;
00306 register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
00307 if (source + extraBytesToWrite > sourceEnd) {
00308 result = sourceExhausted; break;
00309 };
00310 switch(extraBytesToWrite) {
00311 case 5: ch += *source++; ch <<= 6;
00312 case 4: ch += *source++; ch <<= 6;
00313 case 3: ch += *source++; ch <<= 6;
00314 case 2: ch += *source++; ch <<= 6;
00315 case 1: ch += *source++; ch <<= 6;
00316 case 0: ch += *source++;
00317 };
00318 ch -= offsetsFromUTF8[extraBytesToWrite];
00319
00320 if (target >= targetEnd) {
00321 result = targetExhausted; break;
00322 };
00323 if (ch <= kMaximumUCS2) {
00324 *target++ = ch;
00325 } else if (ch > kMaximumUCS4) {
00326 *target++ = kReplacementCharacter;
00327 } else {
00328 if (target + 1 >= targetEnd) {
00329 result = targetExhausted; break;
00330 };
00331 ch -= halfBase;
00332 *target++ = (ch >> halfShift) + kSurrogateHighStart;
00333 *target++ = (ch & halfMask) + kSurrogateLowStart;
00334 };
00335 };
00336 *sourceStart = source;
00337 *targetStart = target;
00338 return result;
00339 };