/////////////////////////////////////////////////////////////////////////////// // // Copyright (C) 2008-2010 Artyom Beilis (Tonkikh) // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public License // along with this program. If not, see . // /////////////////////////////////////////////////////////////////////////////// #ifndef CPPCMS_UTF_ITERATOR_H #define CPPCMS_UTF_ITERATOR_H #include "cstdint.h" #include namespace cppcms { namespace utf { static const uint32_t illegal = 0xFFFFFFFFu; inline bool valid(uint32_t v) { if(v>0x10FFFF) return false; if(0xD800 <=v && v<= 0xDFFF) // surragates return false; return true; } } namespace utf8 { // See RFC 3629 // Based on: http://www.w3.org/International/questions/qa-forms-utf-8 template uint32_t next(Iterator &p,Iterator e,bool html=false,bool decode=false) { unsigned char c=*p++; unsigned char seq0,seq1=0,seq2=0,seq3=0; seq0=c; int len=1; if((c & 0xC0) == 0xC0) { if(p==e) return utf::illegal; seq1=*p++; len=2; } if((c & 0xE0) == 0xE0) { if(p==e) return utf::illegal; seq2=*p++; len=3; } if((c & 0xF0) == 0xF0) { if(p==e) return utf::illegal; seq3=*p++; len=4; } switch(len) { case 1: // ASCII -- remove codes for HTML only if(!html || seq0==0x9 || seq0==0x0A || seq0==0x0D || (0x20<=seq0 && seq0<=0x7E)) break; return utf::illegal; case 2: // non-overloading 2 bytes if(0xC2 <= seq0 && seq0 <= 0xDF) { if(html && seq0==0xC2 && seq1<=0x9F) return utf::illegal; // C1 is illegal if(0x80 <= seq1 && seq1<= 0xBF) break; } return utf::illegal; case 3: if(seq0==0xE0) { // exclude overloadings if(0xA0 <=seq1 && seq1<= 0xBF && 0x80 <=seq2 && seq2<=0xBF) break; } else if( (0xE1 <= seq0 && seq0 <=0xEC) || seq0==0xEE || seq0==0xEF) { // stright 3 bytes if( 0x80 <=seq1 && seq1<=0xBF && 0x80 <=seq2 && seq2<=0xBF) break; } else if(seq0 == 0xED) { // exclude surrogates if( 0x80 <=seq1 && seq1<=0x9F && 0x80 <=seq2 && seq2<=0xBF) break; } return utf::illegal; case 4: switch(seq0) { case 0xF0: // planes 1-3 if( 0x90 <=seq1 && seq1<=0xBF && 0x80 <=seq2 && seq2<=0xBF && 0x80 <=seq3 && seq3<=0xBF) break; return utf::illegal; case 0xF1: // planes 4-15 case 0xF2: case 0xF3: if( 0x80 <=seq1 && seq1<=0xBF && 0x80 <=seq2 && seq2<=0xBF && 0x80 <=seq3 && seq3<=0xBF) break; return utf::illegal; case 0xF4: // pane 16 if( 0x80 <=seq1 && seq1<=0x8F && 0x80 <=seq2 && seq2<=0xBF && 0x80 <=seq3 && seq3<=0xBF) break; return utf::illegal; default: return utf::illegal; } } if(!decode) return 1; switch(len) { case 1: return seq0; case 2: return ((seq0 & 0x1F) << 6) | (seq1 & 0x3F); case 3: return ((seq0 & 0x0F) << 12) | ((seq1 & 0x3F) << 6) | (seq2 & 0x3F) ; case 4: return ((seq0 & 0x07) << 18) | ((seq1 & 0x3F) << 12) | ((seq2 & 0x3F) << 6) | (seq3 & 0x3F) ; } return utf::illegal; } // valid template bool validate(Iterator p,Iterator e,size_t &count,bool html=false) { while(p!=e) { if(next(p,e,html)==utf::illegal) return false; count++; } return true; } template bool validate(Iterator p,Iterator e,bool html=false) { while(p!=e) if(next(p,e,html)==utf::illegal) return false; return true; } inline int width(uint32_t value) { if(value <=0x7F) { return 1; } else if(value <=0x7FF) { return 2; } else if(value <=0xFFFF) { return 3; } else { return 4; } } struct seq { char c[4]; unsigned len; }; inline seq encode(uint32_t value) { seq out={ {0} }; if(value <=0x7F) { out.c[0]=value; out.len=1; } else if(value <=0x7FF) { out.c[0]=(value >> 6) | 0xC0; out.c[1]=(value & 0x3F) | 0x80; out.len=2; } else if(value <=0xFFFF) { out.c[0]=(value >> 12) | 0xE0; out.c[1]=((value >> 6) & 0x3F) | 0x80; out.c[2]=(value & 0x3F) | 0x80; out.len=3; } else { out.c[0]=(value >> 18) | 0xF0; out.c[1]=((value >> 12) & 0x3F) | 0x80; out.c[2]=((value >> 6) & 0x3F) | 0x80; out.c[3]=(value & 0x3F) | 0x80; out.len=4; } return out; } } // namespace utf8 namespace utf16 { // See RFC 2781 inline bool is_first_surrogate(uint16_t x) { return 0xD800 <=x && x<= 0xDBFF; } inline bool is_second_surrogate(uint16_t x) { return 0xDC00 <=x && x<= 0xDFFF; } inline uint32_t combine_surrogate(uint16_t w1,uint16_t w2) { return ((uint32_t(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; } template inline uint32_t next(It ¤t,It last) { uint16_t w1=*current++; if(w1 < 0xD800 || 0xDFFF < w1) { return w1; } if(w1 > 0xDBFF) return utf::illegal; if(current==last) return utf::illegal; uint16_t w2=*current++; if(w2 < 0xDC00 || 0xDFFF < w2) return utf::illegal; return combine_surrogate(w1,w2); } inline int width(uint32_t u) { return u>=0x100000 ? 2 : 1; } struct seq { uint16_t c[2]; unsigned len; }; inline seq encode(uint32_t u) { seq out={ {0} }; if(u<=0xFFFF) { out.c[0]=u; out.len=1; } else { u-=0x10000; out.c[0]=0xD800 | (u>>10); out.c[1]=0xDC00 | (u & 0x3FF); out.len=2; } return out; } } // utf16; } // namespace cppcms #endif