ChipMaster's trial hacks on C++CMS starting with v1.2.1. Not sure I'll follow on with the v2 since it looks to be breaking and mostly frivolous.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

461 lines
13 KiB

  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #ifndef BOOSTER_LOCALE_UTF_H_INCLUDED
  9. #define BOOSTER_LOCALE_UTF_H_INCLUDED
  10. #include <booster/cstdint.h>
  11. namespace booster {
  12. namespace locale {
  13. ///
  14. /// \brief Namespace that holds basic operations on UTF encoded sequences
  15. ///
  16. /// All functions defined in this namespace do not require linking with Boost.Locale library
  17. ///
  18. namespace utf {
  19. /// \cond INTERNAL
  20. #ifdef __GNUC__
  21. # define BOOSTER_LOCALE_LIKELY(x) __builtin_expect((x),1)
  22. # define BOOSTER_LOCALE_UNLIKELY(x) __builtin_expect((x),0)
  23. #else
  24. # define BOOSTER_LOCALE_LIKELY(x) (x)
  25. # define BOOSTER_LOCALE_UNLIKELY(x) (x)
  26. #endif
  27. /// \endcond
  28. ///
  29. /// \brief The integral type type that can hold a Unicode code point
  30. ///
  31. typedef uint32_t code_point;
  32. ///
  33. /// \brief Special constant that defines illegal code point
  34. ///
  35. static const code_point illegal = 0xFFFFFFFFu;
  36. ///
  37. /// \brief Special constant that defines incomplete code point
  38. ///
  39. static const code_point incomplete = 0xFFFFFFFEu;
  40. ///
  41. /// \brief the function checks if \a v is a valid code point
  42. ///
  43. inline bool is_valid_codepoint(code_point v)
  44. {
  45. if(v>0x10FFFF)
  46. return false;
  47. if(0xD800 <=v && v<= 0xDFFF) // surragates
  48. return false;
  49. return true;
  50. }
  51. #ifdef BOOSTER_LOCALE_DOXYGEN
  52. ///
  53. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  54. ///
  55. template<typename CharType,int size=sizeof(CharType)>
  56. struct utf_traits {
  57. ///
  58. /// The type of the character
  59. ///
  60. typedef CharType char_type;
  61. ///
  62. /// Read one code point from the range [p,e) and return it.
  63. ///
  64. /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
  65. /// - If illegal sequence detected returns \ref illegal
  66. ///
  67. /// Requirements
  68. ///
  69. /// - Iterator is valid input iterator
  70. ///
  71. /// Postconditions
  72. ///
  73. /// - p points to the last consumed character
  74. ///
  75. template<typename Iterator>
  76. static code_point decode(Iterator &p,Iterator e);
  77. ///
  78. /// Maximal width of valid sequence in the code units:
  79. ///
  80. /// - UTF-8 - 4
  81. /// - UTF-16 - 2
  82. /// - UTF-32 - 1
  83. ///
  84. static const int max_width;
  85. ///
  86. /// The width of specific code point in the code units.
  87. ///
  88. /// Requirement: value is a valid Unicode code point
  89. /// Returns value in range [1..max_width]
  90. ///
  91. static int width(code_point value);
  92. ///
  93. /// Get the size of the trail part of variable length encoded sequence.
  94. ///
  95. /// Returns -1 if C is not valid lead character
  96. ///
  97. static int trail_length(char_type c);
  98. ///
  99. /// Returns true if c is trail code unit, always false for UTF-32
  100. ///
  101. static bool is_trail(char_type c);
  102. ///
  103. /// Returns true if c is lead code unit, always true of UTF-32
  104. ///
  105. static bool is_lead(char_type c);
  106. ///
  107. /// Convert valid Unicode code point \a value to the UTF sequence.
  108. ///
  109. /// Requirements:
  110. ///
  111. /// - \a value is valid code point
  112. /// - \a out is an output iterator should be able to accept at least width(value) units
  113. ///
  114. /// Returns the iterator past the last written code unit.
  115. ///
  116. template<typename Iterator>
  117. static Iterator encode(code_point value,Iterator out);
  118. ///
  119. /// Decodes valid UTF sequence that is pointed by p into code point.
  120. ///
  121. /// If the sequence is invalid or points to end the behavior is undefined
  122. ///
  123. template<typename Iterator>
  124. static code_point decode_valid(Iterator &p);
  125. };
  126. #else
  127. template<typename CharType,int size=sizeof(CharType)>
  128. struct utf_traits;
  129. template<typename CharType>
  130. struct utf_traits<CharType,1> {
  131. typedef CharType char_type;
  132. static int trail_length(char_type ci)
  133. {
  134. unsigned char c = ci;
  135. if(c < 128)
  136. return 0;
  137. if(BOOSTER_LOCALE_UNLIKELY(c < 194))
  138. return -1;
  139. if(c < 224)
  140. return 1;
  141. if(c < 240)
  142. return 2;
  143. if(BOOSTER_LOCALE_LIKELY(c <=244))
  144. return 3;
  145. return -1;
  146. }
  147. static const int max_width = 4;
  148. static int width(code_point value)
  149. {
  150. if(value <=0x7F) {
  151. return 1;
  152. }
  153. else if(value <=0x7FF) {
  154. return 2;
  155. }
  156. else if(BOOSTER_LOCALE_LIKELY(value <=0xFFFF)) {
  157. return 3;
  158. }
  159. else {
  160. return 4;
  161. }
  162. }
  163. static bool is_trail(char_type ci)
  164. {
  165. unsigned char c=ci;
  166. return (c & 0xC0)==0x80;
  167. }
  168. static bool is_lead(char_type ci)
  169. {
  170. return !is_trail(ci);
  171. }
  172. template<typename Iterator>
  173. static code_point decode(Iterator &p,Iterator e)
  174. {
  175. if(BOOSTER_LOCALE_UNLIKELY(p==e))
  176. return incomplete;
  177. unsigned char lead = *p++;
  178. // First byte is fully validated here
  179. int trail_size = trail_length(lead);
  180. if(BOOSTER_LOCALE_UNLIKELY(trail_size < 0))
  181. return illegal;
  182. //
  183. // Ok as only ASCII may be of size = 0
  184. // also optimize for ASCII text
  185. //
  186. if(trail_size == 0)
  187. return lead;
  188. code_point c = lead & ((1<<(6-trail_size))-1);
  189. // Read the rest
  190. unsigned char tmp;
  191. switch(trail_size) {
  192. case 3:
  193. if(BOOSTER_LOCALE_UNLIKELY(p==e))
  194. return incomplete;
  195. tmp = *p++;
  196. if (!is_trail(tmp))
  197. return illegal;
  198. c = (c << 6) | ( tmp & 0x3F);
  199. case 2:
  200. if(BOOSTER_LOCALE_UNLIKELY(p==e))
  201. return incomplete;
  202. tmp = *p++;
  203. if (!is_trail(tmp))
  204. return illegal;
  205. c = (c << 6) | ( tmp & 0x3F);
  206. case 1:
  207. if(BOOSTER_LOCALE_UNLIKELY(p==e))
  208. return incomplete;
  209. tmp = *p++;
  210. if (!is_trail(tmp))
  211. return illegal;
  212. c = (c << 6) | ( tmp & 0x3F);
  213. }
  214. // Check code point validity: no surrogates and
  215. // valid range
  216. if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
  217. return illegal;
  218. // make sure it is the most compact representation
  219. if(BOOSTER_LOCALE_UNLIKELY(width(c)!=trail_size + 1))
  220. return illegal;
  221. return c;
  222. }
  223. template<typename Iterator>
  224. static code_point decode_valid(Iterator &p)
  225. {
  226. unsigned char lead = *p++;
  227. if(lead < 192)
  228. return lead;
  229. int trail_size;
  230. if(lead < 224)
  231. trail_size = 1;
  232. else if(BOOSTER_LOCALE_LIKELY(lead < 240)) // non-BMP rare
  233. trail_size = 2;
  234. else
  235. trail_size = 3;
  236. code_point c = lead & ((1<<(6-trail_size))-1);
  237. switch(trail_size) {
  238. case 3:
  239. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  240. case 2:
  241. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  242. case 1:
  243. c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F);
  244. }
  245. return c;
  246. }
  247. template<typename Iterator>
  248. static Iterator encode(code_point value,Iterator out)
  249. {
  250. if(value <= 0x7F) {
  251. *out++ = static_cast<char_type>(value);
  252. }
  253. else if(value <= 0x7FF) {
  254. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  255. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  256. }
  257. else if(BOOSTER_LOCALE_LIKELY(value <= 0xFFFF)) {
  258. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  259. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  260. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  261. }
  262. else {
  263. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  264. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  265. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  266. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  267. }
  268. return out;
  269. }
  270. }; // utf8
  271. template<typename CharType>
  272. struct utf_traits<CharType,2> {
  273. typedef CharType char_type;
  274. // See RFC 2781
  275. static bool is_first_surrogate(uint16_t x)
  276. {
  277. return 0xD800 <=x && x<= 0xDBFF;
  278. }
  279. static bool is_second_surrogate(uint16_t x)
  280. {
  281. return 0xDC00 <=x && x<= 0xDFFF;
  282. }
  283. static code_point combine_surrogate(uint16_t w1,uint16_t w2)
  284. {
  285. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  286. }
  287. static int trail_length(char_type c)
  288. {
  289. if(is_first_surrogate(c))
  290. return 1;
  291. if(is_second_surrogate(c))
  292. return -1;
  293. return 0;
  294. }
  295. ///
  296. /// Returns true if c is trail code unit, always false for UTF-32
  297. ///
  298. static bool is_trail(char_type c)
  299. {
  300. return is_second_surrogate(c);
  301. }
  302. ///
  303. /// Returns true if c is lead code unit, always true of UTF-32
  304. ///
  305. static bool is_lead(char_type c)
  306. {
  307. return !is_second_surrogate(c);
  308. }
  309. template<typename It>
  310. static code_point decode(It &current,It last)
  311. {
  312. if(BOOSTER_LOCALE_UNLIKELY(current == last))
  313. return incomplete;
  314. uint16_t w1=*current++;
  315. if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
  316. return w1;
  317. }
  318. if(w1 > 0xDBFF)
  319. return illegal;
  320. if(current==last)
  321. return incomplete;
  322. uint16_t w2=*current++;
  323. if(w2 < 0xDC00 || 0xDFFF < w2)
  324. return illegal;
  325. return combine_surrogate(w1,w2);
  326. }
  327. template<typename It>
  328. static code_point decode_valid(It &current)
  329. {
  330. uint16_t w1=*current++;
  331. if(BOOSTER_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) {
  332. return w1;
  333. }
  334. uint16_t w2=*current++;
  335. return combine_surrogate(w1,w2);
  336. }
  337. static const int max_width = 2;
  338. static int width(code_point u)
  339. {
  340. return u>=0x10000 ? 2 : 1;
  341. }
  342. template<typename It>
  343. static It encode(code_point u,It out)
  344. {
  345. if(BOOSTER_LOCALE_LIKELY(u<=0xFFFF)) {
  346. *out++ = static_cast<char_type>(u);
  347. }
  348. else {
  349. u -= 0x10000;
  350. *out++ = static_cast<char_type>(0xD800 | (u>>10));
  351. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  352. }
  353. return out;
  354. }
  355. }; // utf16;
  356. template<typename CharType>
  357. struct utf_traits<CharType,4> {
  358. typedef CharType char_type;
  359. static int trail_length(char_type c)
  360. {
  361. if(is_valid_codepoint(c))
  362. return 0;
  363. return -1;
  364. }
  365. static bool is_trail(char_type /*c*/)
  366. {
  367. return false;
  368. }
  369. static bool is_lead(char_type /*c*/)
  370. {
  371. return true;
  372. }
  373. template<typename It>
  374. static code_point decode_valid(It &current)
  375. {
  376. return *current++;
  377. }
  378. template<typename It>
  379. static code_point decode(It &current,It last)
  380. {
  381. if(BOOSTER_LOCALE_UNLIKELY(current == last))
  382. return booster::locale::utf::incomplete;
  383. code_point c=*current++;
  384. if(BOOSTER_LOCALE_UNLIKELY(!is_valid_codepoint(c)))
  385. return booster::locale::utf::illegal;
  386. return c;
  387. }
  388. static const int max_width = 1;
  389. static int width(code_point /*u*/)
  390. {
  391. return 1;
  392. }
  393. template<typename It>
  394. static It encode(code_point u,It out)
  395. {
  396. *out++ = static_cast<char_type>(u);
  397. return out;
  398. }
  399. }; // utf32
  400. #endif
  401. } // utf
  402. } // locale
  403. } // boost
  404. #endif
  405. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4