ChipMaster's trial hacks on C++CMS starting with v1.2.1. Not sure I'll follow on with the v2 since it looks to be breaking and mostly frivolous.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

270 lines
6.0 KiB

  1. ///////////////////////////////////////////////////////////////////////////////
  2. //
  3. // Copyright (C) 2008-2010 Artyom Beilis (Tonkikh) <artyomtnk@yahoo.com>
  4. //
  5. // This program is free software: you can redistribute it and/or modify
  6. // it under the terms of the GNU Lesser General Public License as published by
  7. // the Free Software Foundation, either version 3 of the License, or
  8. // (at your option) any later version.
  9. //
  10. // This program is distributed in the hope that it will be useful,
  11. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. // GNU Lesser General Public License for more details.
  14. //
  15. // You should have received a copy of the GNU Lesser General Public License
  16. // along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. //
  18. ///////////////////////////////////////////////////////////////////////////////
  19. #ifndef CPPCMS_UTF_ITERATOR_H
  20. #define CPPCMS_UTF_ITERATOR_H
  21. #include "cstdint.h"
  22. #include <string.h>
  23. namespace cppcms {
  namespace utf {
      /// Sentinel code point value; the next() decoders in this header return
      /// it to signal an error.
      static const uint32_t illegal = 0xFFFFFFFFu;

      /// Tell whether v is a legal Unicode code point.
      ///
      /// \param v code point to test
      /// \return true when v is at most 0x10FFFF and is not in the surrogate
      ///         range 0xD800..0xDFFF
      inline bool valid(uint32_t v)
      {
          const bool in_range = v <= 0x10FFFF;
          const bool surrogate = 0xD800 <= v && v <= 0xDFFF;
          return in_range && !surrogate;
      }
  }
  35. namespace utf8 {
  36. // See RFC 3629
  37. // Based on: http://www.w3.org/International/questions/qa-forms-utf-8
  38. template<typename Iterator>
  39. uint32_t next(Iterator &p,Iterator e,bool html=false,bool decode=false)
  40. {
  41. unsigned char c=*p++;
  42. unsigned char seq0,seq1=0,seq2=0,seq3=0;
  43. seq0=c;
  44. int len=1;
  45. if((c & 0xC0) == 0xC0) {
  46. if(p==e)
  47. return utf::illegal;
  48. seq1=*p++;
  49. len=2;
  50. }
  51. if((c & 0xE0) == 0xE0) {
  52. if(p==e)
  53. return utf::illegal;
  54. seq2=*p++;
  55. len=3;
  56. }
  57. if((c & 0xF0) == 0xF0) {
  58. if(p==e)
  59. return utf::illegal;
  60. seq3=*p++;
  61. len=4;
  62. }
  63. switch(len) {
  64. case 1: // ASCII -- remove codes for HTML only
  65. if(!html || seq0==0x9 || seq0==0x0A || seq0==0x0D || (0x20<=seq0 && seq0<=0x7E))
  66. break;
  67. return utf::illegal;
  68. case 2: // non-overloading 2 bytes
  69. if(0xC2 <= seq0 && seq0 <= 0xDF) {
  70. if(html && seq0==0xC2 && seq1<=0x9F)
  71. return utf::illegal; // C1 is illegal
  72. if(0x80 <= seq1 && seq1<= 0xBF)
  73. break;
  74. }
  75. return utf::illegal;
  76. case 3:
  77. if(seq0==0xE0) { // exclude overloadings
  78. if(0xA0 <=seq1 && seq1<= 0xBF && 0x80 <=seq2 && seq2<=0xBF)
  79. break;
  80. }
  81. else if( (0xE1 <= seq0 && seq0 <=0xEC) || seq0==0xEE || seq0==0xEF) { // stright 3 bytes
  82. if( 0x80 <=seq1 && seq1<=0xBF &&
  83. 0x80 <=seq2 && seq2<=0xBF)
  84. break;
  85. }
  86. else if(seq0 == 0xED) { // exclude surrogates
  87. if( 0x80 <=seq1 && seq1<=0x9F &&
  88. 0x80 <=seq2 && seq2<=0xBF)
  89. break;
  90. }
  91. return utf::illegal;
  92. case 4:
  93. switch(seq0) {
  94. case 0xF0: // planes 1-3
  95. if( 0x90 <=seq1 && seq1<=0xBF &&
  96. 0x80 <=seq2 && seq2<=0xBF &&
  97. 0x80 <=seq3 && seq3<=0xBF)
  98. break;
  99. return utf::illegal;
  100. case 0xF1: // planes 4-15
  101. case 0xF2:
  102. case 0xF3:
  103. if( 0x80 <=seq1 && seq1<=0xBF &&
  104. 0x80 <=seq2 && seq2<=0xBF &&
  105. 0x80 <=seq3 && seq3<=0xBF)
  106. break;
  107. return utf::illegal;
  108. case 0xF4: // pane 16
  109. if( 0x80 <=seq1 && seq1<=0x8F &&
  110. 0x80 <=seq2 && seq2<=0xBF &&
  111. 0x80 <=seq3 && seq3<=0xBF)
  112. break;
  113. return utf::illegal;
  114. default:
  115. return utf::illegal;
  116. }
  117. }
  118. if(!decode)
  119. return 1;
  120. switch(len) {
  121. case 1:
  122. return seq0;
  123. case 2:
  124. return ((seq0 & 0x1F) << 6) | (seq1 & 0x3F);
  125. case 3:
  126. return ((seq0 & 0x0F) << 12) | ((seq1 & 0x3F) << 6) | (seq2 & 0x3F) ;
  127. case 4:
  128. return ((seq0 & 0x07) << 18) | ((seq1 & 0x3F) << 12) | ((seq2 & 0x3F) << 6) | (seq3 & 0x3F) ;
  129. }
  130. return utf::illegal;
  131. } // valid
  132. template<typename Iterator>
  133. bool validate(Iterator p,Iterator e,size_t &count,bool html=false)
  134. {
  135. while(p!=e) {
  136. if(next(p,e,html)==utf::illegal)
  137. return false;
  138. count++;
  139. }
  140. return true;
  141. }
  142. template<typename Iterator>
  143. bool validate(Iterator p,Iterator e,bool html=false)
  144. {
  145. while(p!=e)
  146. if(next(p,e,html)==utf::illegal)
  147. return false;
  148. return true;
  149. }
  /// Number of bytes encode() uses for a code point.
  ///
  /// \param value code point; it is not checked for validity
  /// \return 1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, otherwise 4
  inline int width(uint32_t value)
  {
      if(value < 0x80)
          return 1;
      if(value < 0x800)
          return 2;
      if(value < 0x10000)
          return 3;
      return 4;
  }
  /// UTF-8 byte sequence for a single code point, as produced by encode().
  struct seq {
      char c[4];      ///< encoded bytes; only the first len entries are used
      unsigned len;   ///< number of bytes stored in c
  };

  /// Encode a code point as UTF-8.
  ///
  /// \param value code point; it is not checked for validity
  /// \return the byte sequence and its length (1 to 4); unused bytes are 0
  inline seq encode(uint32_t value)
  {
      // Marker bits of the lead byte, indexed by sequence length.
      static const unsigned char lead_mark[5]={0x00,0x00,0xC0,0xE0,0xF0};
      seq result={ {0} };
      result.len = value<=0x7F ? 1 : (value<=0x7FF ? 2 : (value<=0xFFFF ? 3 : 4));
      // Fill the trailing bytes from the back, six payload bits at a time;
      // whatever is left over belongs in the lead byte.
      uint32_t rest=value;
      for(unsigned i=result.len-1; i>0; --i) {
          result.c[i]=(rest & 0x3F) | 0x80;
          rest>>=6;
      }
      result.c[0]=rest | lead_mark[result.len];
      return result;
  }
  196. } // namespace utf8
  197. namespace utf16 {
  198. // See RFC 2781
  /// True when x is a lead (high) surrogate, i.e. lies in 0xD800..0xDBFF.
  inline bool is_first_surrogate(uint16_t x)
  {
      // Every value in that range shares the top six bits 110110.
      return (x & 0xFC00) == 0xD800;
  }
  /// True when x is a trail (low) surrogate, i.e. lies in 0xDC00..0xDFFF.
  inline bool is_second_surrogate(uint16_t x)
  {
      // Every value in that range shares the top six bits 110111.
      return (x & 0xFC00) == 0xDC00;
  }
  /// Build a code point from a surrogate pair.
  ///
  /// \param w1 lead surrogate; only its low 10 bits are used
  /// \param w2 trail surrogate; only its low 10 bits are used
  /// \return 0x10000 plus the 20-bit value formed by the two payloads
  inline uint32_t combine_surrogate(uint16_t w1,uint16_t w2)
  {
      const uint32_t high_bits = w1 & 0x3FF;
      const uint32_t low_bits = w2 & 0x3FF;
      return 0x10000 + (high_bits << 10) + low_bits;
  }
  211. template<typename It>
  212. inline uint32_t next(It &current,It last)
  213. {
  214. uint16_t w1=*current++;
  215. if(w1 < 0xD800 || 0xDFFF < w1) {
  216. return w1;
  217. }
  218. if(w1 > 0xDBFF)
  219. return utf::illegal;
  220. if(current==last)
  221. return utf::illegal;
  222. uint16_t w2=*current++;
  223. if(w2 < 0xDC00 || 0xDFFF < w2)
  224. return utf::illegal;
  225. return combine_surrogate(w1,w2);
  226. }
  /// Number of 16-bit units needed to represent a code point in UTF-16.
  ///
  /// \param u code point; it is not checked for validity
  /// \return 1 up to 0xFFFF, otherwise 2 (a surrogate pair)
  inline int width(uint32_t u)
  {
      // The first code point outside the BMP is 0x10000. This is the same
      // boundary encode() uses when it switches to a surrogate pair.
      return u>=0x10000 ? 2 : 1;
  }
  /// UTF-16 unit sequence for a single code point, as produced by encode().
  struct seq {
      uint16_t c[2];  ///< encoded units; only the first len entries are used
      unsigned len;   ///< number of units stored in c
  };

  /// Encode a code point as UTF-16.
  ///
  /// \param u code point; it is not checked for validity
  /// \return one unit for values up to 0xFFFF, otherwise a surrogate pair;
  ///         an unused second unit is 0
  inline seq encode(uint32_t u)
  {
      seq result={ {0} };
      if(u>0xFFFF) {
          // Split the 20-bit offset above 0x10000 across lead and trail.
          const uint32_t offset=u-0x10000;
          result.c[0]=0xD800 | (offset>>10);
          result.c[1]=0xDC00 | (offset & 0x3FF);
          result.len=2;
          return result;
      }
      result.c[0]=u;
      result.len=1;
      return result;
  }
  250. } // utf16;
  251. } // namespace cppcms
  252. #endif