cvtutf.c
Go to the documentation of this file.
00001 /* ================================================================ */
00002 /*
00003 File:   ConvertUTF.C
00004 Author: Mark E. Davis
00005 Copyright (C) 1994 Taligent, Inc. All rights reserved.
00006 
00007 This code is copyrighted. Under the copyright laws, this code may not
00008 be copied, in whole or part, without prior written consent of Taligent. 
00009 
00010 Taligent grants the right to use or reprint this code as long as this
00011 ENTIRE copyright notice is reproduced in the code or reproduction.
00012 The code is provided AS-IS, AND TALIGENT DISCLAIMS ALL WARRANTIES,
00013 EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO IMPLIED
00014 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  IN
00015 NO EVENT WILL TALIGENT BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING,
00016 WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS
00017 INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
00018 LOSS) ARISING OUT OF THE USE OR INABILITY TO USE THIS CODE, EVEN
00019 IF TALIGENT HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
00020 BECAUSE SOME STATES DO NOT ALLOW THE EXCLUSION OR LIMITATION OF
00021 LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE
00022 LIMITATION MAY NOT APPLY TO YOU.
00023 
00024 RESTRICTED RIGHTS LEGEND: Use, duplication, or disclosure by the
00025 government is subject to restrictions as set forth in subparagraph
00026 (c)(l)(ii) of the Rights in Technical Data and Computer Software
00027 clause at DFARS 252.227-7013 and FAR 52.227-19.
00028 
00029 This code may be protected by one or more U.S. and International
00030 Patents.
00031 
00032 TRADEMARKS: Taligent and the Taligent Design Mark are registered
00033 trademarks of Taligent, Inc.
00034 */
00035 /* ================================================================ */
00036 
00037 #include "cvtutf.h"
00038 
00039 /* ================================================================ */
00040 
00041 static const int halfShift             = 10;           /* shift applied to the high half when splitting/combining a surrogate pair */
00042 static const UCS4 halfBase             = 0x0010000UL;  /* subtracted before splitting into a pair, added back after combining */
00043 static const UCS4 halfMask             = 0x3FFUL;      /* mask selecting the bits stored in the low surrogate */
00044 static const UCS4 kSurrogateHighStart  = 0xD800UL;     /* lower bound tested for the first unit of a pair */
00045 static const UCS4 kSurrogateHighEnd    = 0xDBFFUL;     /* upper bound tested for the first unit of a pair */
00046 static const UCS4 kSurrogateLowStart   = 0xDC00UL;     /* lower bound tested for the second unit of a pair */
00047 static const UCS4 kSurrogateLowEnd     = 0xDFFFUL;     /* upper bound tested for the second unit of a pair */
00048 
00049 /* ================================================================ */
00050 
00051 ConversionResult
00052 ConvertUCS4toUTF16(UCS4** sourceStart, const UCS4* sourceEnd, 
00053                    UTF16** targetStart, const UTF16* targetEnd)
00054 {
00055   ConversionResult result = ok;
00056   register UCS4* source = *sourceStart;
00057   register UTF16* target = *targetStart;
00058   while (source < sourceEnd) {
00059     register UCS4 ch;
00060     if (target >= targetEnd) {
00061       result = targetExhausted; break;
00062     };
00063     ch = *source++;
00064     if (ch <= kMaximumUCS2) {
00065       *target++ = ch;
00066     } else if (ch > kMaximumUTF16) {
00067       *target++ = kReplacementCharacter;
00068     } else {
00069       if (target + 1 >= targetEnd) {
00070         result = targetExhausted; break;
00071       };
00072       ch -= halfBase;
00073       *target++ = (ch >> halfShift) + kSurrogateHighStart;
00074       *target++ = (ch & halfMask) + kSurrogateLowStart;
00075     };
00076   };
00077   *sourceStart = source;
00078   *targetStart = target;
00079   return result;
00080 };
00081 
00082 /* ================================================================ */
00083 
00084 ConversionResult ConvertUTF16toUCS4(UTF16** sourceStart, UTF16* sourceEnd, 
00085                                     UCS4** targetStart, const UCS4* targetEnd)
00086 {
00087   ConversionResult result = ok;
00088   register UTF16* source = *sourceStart;
00089   register UCS4* target = *targetStart;
00090   while (source < sourceEnd) {
00091     register UCS4 ch;
00092     ch = *source++;
00093     if (ch >= kSurrogateHighStart &&
00094         ch <= kSurrogateHighEnd &&
00095         source < sourceEnd) {
00096       register UCS4 ch2 = *source;
00097       if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00098         ch = ((ch - kSurrogateHighStart) << halfShift)
00099           + (ch2 - kSurrogateLowStart) + halfBase;
00100         ++source;
00101       };
00102     };
00103     if (target >= targetEnd) {
00104       result = targetExhausted; break;
00105     };
00106     *target++ = ch;
00107   };
00108   *sourceStart = source;
00109   *targetStart = target;
00110   return result;
00111 };
00112 
00113 /* ================================================================ */
00114 
00115 static UCS4 offsetsFromUTF8[6] = {
00116   0x00000000UL, 0x00003080UL, 0x000E2080UL, 
00117   0x03C82080UL, 0xFA082080UL, 0x82082080UL
00118 };
00119 static char bytesFromUTF8[256] = {
00120   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00121   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00122   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00123   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00124   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00125   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
00126   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00127   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
00128 };
00129 
00130 static UTF8 firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
00131 
00132 /* ================================================================ */
00133 /*      This code is similar in effect to making successive calls on the
00134 mbtowc and wctomb routines in FSS-UTF. However, it is considerably
00135 different in code:
00136 * it is adapted to be consistent with UTF16;
00137 * the interface converts a whole buffer to avoid function-call overhead;
00138 * constants have been gathered;
00139 * loops & conditionals have been removed as much as possible for
00140 efficiency, in favor of drop-through switch statements.
00141 */
00142 
00143 /* ================================================================ */
00144 int NSConvertUTF16toUTF8(unichar             **sourceStart,
00145                          const unichar       *sourceEnd, 
00146                          unsigned char       **targetStart,
00147                          const unsigned char *targetEnd)
00148 {
00149   ConversionResult result = ok;
00150   register UTF16* source = *sourceStart;
00151   register UTF8* target = *targetStart;
00152   while (source < sourceEnd) {
00153     register UCS4 ch;
00154     register unsigned short bytesToWrite = 0;
00155     register const UCS4 byteMask = 0xBF;
00156     register const UCS4 byteMark = 0x80; 
00157     ch = *source++;
00158     if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
00159         && source < sourceEnd) {
00160       register UCS4 ch2 = *source;
00161       if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00162         ch = ((ch - kSurrogateHighStart) << halfShift)
00163           + (ch2 - kSurrogateLowStart) + halfBase;
00164         ++source;
00165       };
00166     };
00167     if (ch < 0x80) {                    bytesToWrite = 1;
00168     } else if (ch < 0x800) {            bytesToWrite = 2;
00169     } else if (ch < 0x10000) {          bytesToWrite = 3;
00170     } else if (ch < 0x200000) {         bytesToWrite = 4;
00171     } else if (ch < 0x4000000) {        bytesToWrite = 5;
00172     } else if (ch <= kMaximumUCS4){     bytesToWrite = 6;
00173     } else {                                            bytesToWrite = 2;
00174     ch = kReplacementCharacter;
00175     }; /* I wish there were a smart way to avoid this conditional */
00176                 
00177     target += bytesToWrite;
00178     if (target > targetEnd) {
00179       target -= bytesToWrite; result = targetExhausted; break;
00180     };
00181     switch (bytesToWrite) {     /* note: code falls through cases! */
00182       case 6:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00183       case 5:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00184       case 4:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00185       case 3:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00186       case 2:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00187       case 1:   *--target =  ch | firstByteMark[bytesToWrite];
00188     };
00189     target += bytesToWrite;
00190   };
00191   *sourceStart = source;
00192   *targetStart = target;
00193 
00194   return result;
00195 };
00196 
00197 /* ================================================================ */
00198 
00199 int NSConvertUTF8toUTF16(unsigned char **sourceStart, unsigned char *sourceEnd, 
00200                          unichar **targetStart, const unichar *targetEnd)
00201 {
00202   ConversionResult result = ok;
00203   register UTF8  *source = *sourceStart;
00204   register UTF16 *target = *targetStart;
00205   
00206   while (source < sourceEnd) {
00207     register UCS4 ch = 0;
00208     register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
00209 
00210     if (source + extraBytesToWrite > sourceEnd) {
00211       result = sourceExhausted; break;
00212     };
00213     switch(extraBytesToWrite) { /* note: code falls through cases! */
00214       case 5:   ch += *source++; ch <<= 6;
00215       case 4:   ch += *source++; ch <<= 6;
00216       case 3:   ch += *source++; ch <<= 6;
00217       case 2:   ch += *source++; ch <<= 6;
00218       case 1:   ch += *source++; ch <<= 6;
00219       case 0:   ch += *source++;
00220     };
00221     ch -= offsetsFromUTF8[extraBytesToWrite];
00222     
00223     if (target >= targetEnd) {
00224       result = targetExhausted; break;
00225     };
00226     if (ch <= kMaximumUCS2) {
00227       *target++ = ch;
00228     } else if (ch > kMaximumUTF16) {
00229       *target++ = kReplacementCharacter;
00230     } else {
00231       if (target + 1 >= targetEnd) {
00232         result = targetExhausted; break;
00233       };
00234       ch -= halfBase;
00235       *target++ = (ch >> halfShift) + kSurrogateHighStart;
00236       *target++ = (ch & halfMask) + kSurrogateLowStart;
00237     };
00238   };
00239   *sourceStart = source;
00240   *targetStart = target;
00241   
00242   return result;
00243 };
00244 
00245 /* ================================================================ */
00246 ConversionResult ConvertUCS4toUTF8 ( UCS4** sourceStart, const UCS4* sourceEnd, 
00247                                      UTF8** targetStart, const UTF8* targetEnd)
00248 {
00249   ConversionResult result = ok;
00250   register UCS4* source = *sourceStart;
00251   register UTF8* target = *targetStart;
00252   while (source < sourceEnd) {
00253     register UCS4 ch;
00254     register unsigned short bytesToWrite = 0;
00255     register const UCS4 byteMask = 0xBF;
00256     register const UCS4 byteMark = 0x80; 
00257     ch = *source++;
00258     if (ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd
00259         && source < sourceEnd) {
00260       register UCS4 ch2 = *source;
00261       if (ch2 >= kSurrogateLowStart && ch2 <= kSurrogateLowEnd) {
00262         ch = ((ch - kSurrogateHighStart) << halfShift)
00263           + (ch2 - kSurrogateLowStart) + halfBase;
00264         ++source;
00265       };
00266     };
00267     if (ch < 0x80) {                            bytesToWrite = 1;
00268     } else if (ch < 0x800) {            bytesToWrite = 2;
00269     } else if (ch < 0x10000) {          bytesToWrite = 3;
00270     } else if (ch < 0x200000) {         bytesToWrite = 4;
00271     } else if (ch < 0x4000000) {        bytesToWrite = 5;
00272     } else if (ch <= kMaximumUCS4){     bytesToWrite = 6;
00273     } else {                                            bytesToWrite = 2;
00274     ch = kReplacementCharacter;
00275     }; /* I wish there were a smart way to avoid this conditional */
00276                 
00277     target += bytesToWrite;
00278     if (target > targetEnd) {
00279       target -= bytesToWrite; result = targetExhausted; break;
00280     };
00281     switch (bytesToWrite) {     /* note: code falls through cases! */
00282       case 6:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00283       case 5:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00284       case 4:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00285       case 3:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00286       case 2:   *--target = (ch | byteMark) & byteMask; ch >>= 6;
00287       case 1:   *--target =  ch | firstByteMark[bytesToWrite];
00288     };
00289     target += bytesToWrite;
00290   };
00291   *sourceStart = source;
00292   *targetStart = target;
00293   return result;
00294 };
00295 
00296 /* ================================================================ */
00297 
00298 ConversionResult ConvertUTF8toUCS4 (UTF8** sourceStart, UTF8* sourceEnd, 
00299                                     UCS4** targetStart, const UCS4* targetEnd)
00300 {
00301   ConversionResult result = ok;
00302   register UTF8* source = *sourceStart;
00303   register UCS4* target = *targetStart;
00304   while (source < sourceEnd) {
00305     register UCS4 ch = 0;
00306     register unsigned short extraBytesToWrite = bytesFromUTF8[*source];
00307     if (source + extraBytesToWrite > sourceEnd) {
00308       result = sourceExhausted; break;
00309     };
00310     switch(extraBytesToWrite) { /* note: code falls through cases! */
00311       case 5:   ch += *source++; ch <<= 6;
00312       case 4:   ch += *source++; ch <<= 6;
00313       case 3:   ch += *source++; ch <<= 6;
00314       case 2:   ch += *source++; ch <<= 6;
00315       case 1:   ch += *source++; ch <<= 6;
00316       case 0:   ch += *source++;
00317     };
00318     ch -= offsetsFromUTF8[extraBytesToWrite];
00319 
00320     if (target >= targetEnd) {
00321       result = targetExhausted; break;
00322     };
00323     if (ch <= kMaximumUCS2) {
00324       *target++ = ch;
00325     } else if (ch > kMaximumUCS4) {
00326       *target++ = kReplacementCharacter;
00327     } else {
00328       if (target + 1 >= targetEnd) {
00329         result = targetExhausted; break;
00330       };
00331       ch -= halfBase;
00332       *target++ = (ch >> halfShift) + kSurrogateHighStart;
00333       *target++ = (ch & halfMask) + kSurrogateLowStart;
00334     };
00335   };
00336   *sourceStart = source;
00337   *targetStart = target;
00338   return result;
00339 };


pedal_monitor
Author(s): Pedro Mendes
autogenerated on Fri Jun 6 2014 18:37:21