ChipMaster's trial hacks on C++CMS starting with v1.2.1. Not sure I'll follow on with the v2 since it looks to be breaking and mostly frivolous.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

252 lines
4.9 KiB

  1. #ifndef CPPCMS_UTF_ITERATOR_H
  2. #define CPPCMS_UTF_ITERATOR_H
  3. #include "cstdint.h"
  4. #include <string.h>
  5. namespace cppcms {
  namespace utf {
    // Sentinel returned by the decoding helpers when the input is malformed.
    // It lies above 0x10FFFF, so valid() rejects it as well.
    static const uint32_t illegal = 0xFFFFFFFFu;

    // Tells whether v is a Unicode scalar value: no greater than 0x10FFFF
    // and not inside the UTF-16 surrogate range 0xD800..0xDFFF.
    inline bool valid(uint32_t v)
    {
      const bool in_unicode_range = v <= 0x10FFFF;
      const bool is_surrogate = 0xD800 <= v && v <= 0xDFFF;
      return in_unicode_range && !is_surrogate;
    }
  }
  17. namespace utf8 {
  18. // See RFC 3629
  19. // Based on: http://www.w3.org/International/questions/qa-forms-utf-8
  20. template<typename Iterator>
  21. uint32_t next(Iterator &p,Iterator e,bool html=false,bool decode=false)
  22. {
  23. unsigned char c=*p++;
  24. unsigned char seq0,seq1=0,seq2=0,seq3=0;
  25. seq0=c;
  26. int len=1;
  27. if((c & 0xC0) == 0xC0) {
  28. if(p==e)
  29. return utf::illegal;
  30. seq1=*p++;
  31. len=2;
  32. }
  33. if((c & 0xE0) == 0xE0) {
  34. if(p==e)
  35. return utf::illegal;
  36. seq2=*p++;
  37. len=3;
  38. }
  39. if((c & 0xF0) == 0xF0) {
  40. if(p==e)
  41. return utf::illegal;
  42. seq3=*p++;
  43. len=4;
  44. }
  45. switch(len) {
  46. case 1: // ASCII -- remove codes for HTML only
  47. if(!html || seq0==0x9 || seq0==0x0A || seq0==0x0D || (0x20<=seq0 && seq0<=0x7E))
  48. break;
  49. return utf::illegal;
  50. case 2: // non-overloading 2 bytes
  51. if(0xC2 <= seq0 && seq0 <= 0xDF) {
  52. if(html && seq0==0xC2 && seq1<=0x9F)
  53. return utf::illegal; // C1 is illegal
  54. if(0x80 <= seq1 && seq1<= 0xBF)
  55. break;
  56. }
  57. return utf::illegal;
  58. case 3:
  59. if(seq0==0xE0) { // exclude overloadings
  60. if(0xA0 <=seq1 && seq1<= 0xBF && 0x80 <=seq2 && seq2<=0xBF)
  61. break;
  62. }
  63. else if( (0xE1 <= seq0 && seq0 <=0xEC) || seq0==0xEE || seq0==0xEF) { // stright 3 bytes
  64. if( 0x80 <=seq1 && seq1<=0xBF &&
  65. 0x80 <=seq2 && seq2<=0xBF)
  66. break;
  67. }
  68. else if(seq0 == 0xED) { // exclude surrogates
  69. if( 0x80 <=seq1 && seq1<=0x9F &&
  70. 0x80 <=seq2 && seq2<=0xBF)
  71. break;
  72. }
  73. return utf::illegal;
  74. case 4:
  75. switch(seq0) {
  76. case 0xF0: // planes 1-3
  77. if( 0x90 <=seq1 && seq1<=0xBF &&
  78. 0x80 <=seq2 && seq2<=0xBF &&
  79. 0x80 <=seq3 && seq3<=0xBF)
  80. break;
  81. return utf::illegal;
  82. case 0xF1: // planes 4-15
  83. case 0xF2:
  84. case 0xF3:
  85. if( 0x80 <=seq1 && seq1<=0xBF &&
  86. 0x80 <=seq2 && seq2<=0xBF &&
  87. 0x80 <=seq3 && seq3<=0xBF)
  88. break;
  89. return utf::illegal;
  90. case 0xF4: // pane 16
  91. if( 0x80 <=seq1 && seq1<=0x8F &&
  92. 0x80 <=seq2 && seq2<=0xBF &&
  93. 0x80 <=seq3 && seq3<=0xBF)
  94. break;
  95. return utf::illegal;
  96. default:
  97. return utf::illegal;
  98. }
  99. }
  100. if(!decode)
  101. return 1;
  102. switch(len) {
  103. case 1:
  104. return seq0;
  105. case 2:
  106. return ((seq0 & 0x1F) << 6) | (seq1 & 0x3F);
  107. case 3:
  108. return ((seq0 & 0x0F) << 12) | ((seq1 & 0x3F) << 6) | (seq2 & 0x3F) ;
  109. case 4:
  110. return ((seq0 & 0x07) << 18) | ((seq1 & 0x3F) << 12) | ((seq2 & 0x3F) << 6) | (seq3 & 0x3F) ;
  111. }
  112. return utf::illegal;
  113. } // valid
  114. template<typename Iterator>
  115. bool validate(Iterator p,Iterator e,size_t &count,bool html=false)
  116. {
  117. while(p!=e) {
  118. if(next(p,e,html)==utf::illegal)
  119. return false;
  120. count++;
  121. }
  122. return true;
  123. }
  124. template<typename Iterator>
  125. bool validate(Iterator p,Iterator e,bool html=false)
  126. {
  127. while(p!=e)
  128. if(next(p,e,html)==utf::illegal)
  129. return false;
  130. return true;
  131. }
  // Number of bytes encode() produces for value: 1 up to 0x7F, 2 up to
  // 0x7FF, 3 up to 0xFFFF and 4 for everything above. The value itself is
  // not validated.
  inline int width(uint32_t value)
  {
    if(value < 0x80)
      return 1;
    if(value < 0x800)
      return 2;
    return value < 0x10000 ? 3 : 4;
  }
  // Result of encode(): the encoded bytes and how many of them are used.
  struct seq {
    char c[4];    // encoded bytes, lead byte first
    unsigned len; // number of entries of c in use
  };

  // Encodes value as UTF-8. The length is chosen from the magnitude of
  // value only; value is not checked for being a legal code point.
  // Entries of c beyond len are left zero.
  inline seq encode(uint32_t value)
  {
    seq result={ {0} };
    if(value < 0x80) {
      result.len=1;
      result.c[0]=value;
      return result;
    }
    if(value < 0x800) {
      result.len=2;
      result.c[0]=0xC0 | (value >> 6);
      result.c[1]=0x80 | (value & 0x3F);
      return result;
    }
    if(value < 0x10000) {
      result.len=3;
      result.c[0]=0xE0 | (value >> 12);
      result.c[1]=0x80 | ((value >> 6) & 0x3F);
      result.c[2]=0x80 | (value & 0x3F);
      return result;
    }
    result.len=4;
    result.c[0]=0xF0 | (value >> 18);
    result.c[1]=0x80 | ((value >> 12) & 0x3F);
    result.c[2]=0x80 | ((value >> 6) & 0x3F);
    result.c[3]=0x80 | (value & 0x3F);
    return result;
  }
  178. } // namespace utf8
  179. namespace utf16 {
  180. // See RFC 2781
  // True when x is a leading (high) surrogate, i.e. 0xD800..0xDBFF.
  // All values of that range share the top six bits 110110.
  inline bool is_first_surrogate(uint16_t x)
  {
    return (x & 0xFC00) == 0xD800;
  }
  // True when x is a trailing (low) surrogate, i.e. 0xDC00..0xDFFF.
  // All values of that range share the top six bits 110111.
  inline bool is_second_surrogate(uint16_t x)
  {
    return (x & 0xFC00) == 0xDC00;
  }
  // Combines a surrogate pair into the code point it encodes.
  //
  // w1 is the leading surrogate and w2 the trailing one; neither is
  // checked here. Each contributes its low 10 bits, giving a 20 bit
  // value to which the offset 0x10000 is added (RFC 2781), so the
  // result covers 0x10000..0x10FFFF.
  inline uint32_t combine_surrogate(uint16_t w1,uint16_t w2)
  {
    const uint32_t high_bits = uint32_t(w1 & 0x3FF) << 10;
    const uint32_t low_bits = w2 & 0x3FF;
    return (high_bits | low_bits) + 0x10000;
  }
  193. template<typename It>
  194. inline uint32_t next(It &current,It last)
  195. {
  196. uint16_t w1=*current++;
  197. if(w1 < 0xD800 || 0xDFFF < w1) {
  198. return w1;
  199. }
  200. if(w1 > 0xDBFF)
  201. return utf::illegal;
  202. if(current==last)
  203. return utf::illegal;
  204. uint16_t w2=*current++;
  205. if(w2 < 0xDC00 || 0xDFFF < w2)
  206. return utf::illegal;
  207. return combine_surrogate(w1,w2);
  208. }
  // Number of 16 bit units needed to encode code point u: 2 (a surrogate
  // pair) for everything above 0xFFFF, otherwise 1. The threshold is
  // 0x10000 so that the result agrees with the split encode() makes at
  // u<=0xFFFF. u itself is not validated.
  inline int width(uint32_t u)
  {
    return u>=0x10000 ? 2 : 1;
  }
  // Result of encode(): the encoded 16 bit units and how many are used.
  struct seq {
    uint16_t c[2]; // encoded units, leading surrogate first for a pair
    unsigned len;  // number of entries of c in use
  };

  // Encodes u as UTF-16. Values up to 0xFFFF are stored as a single
  // unit; larger ones are reduced by 0x10000 and split into a surrogate
  // pair carrying 10 bits each. u is not checked for being a legal code
  // point. An unused second entry of c is left zero.
  inline seq encode(uint32_t u)
  {
    seq result={ {0} };
    if(u > 0xFFFF) {
      const uint32_t offset = u - 0x10000;
      result.len=2;
      result.c[0]=0xD800 | (offset >> 10);
      result.c[1]=0xDC00 | (offset & 0x3FF);
      return result;
    }
    result.len=1;
    result.c[0]=u;
    return result;
  }
  232. } // utf16;
  233. } // namespace cppcms
  234. #endif