ChipMaster's trial hacks on C++CMS starting with v1.2.1. Not sure I'll follow on with the v2 since it looks to be breaking and mostly frivolous.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

671 lines
22 KiB

  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #define BOOSTER_SOURCE
  9. #include <booster/locale/generator.h>
  10. #include <booster/locale/encoding.h>
  11. #include "../encoding/conv.h"
  12. #include <booster/locale/util.h>
  13. #ifdef BOOSTER_MSVC
  14. # pragma warning(disable : 4244 4996) // loose data
  15. #endif
  16. #include <cstddef>
  17. #include <string.h>
  18. #include <vector>
  19. #include <algorithm>
  20. //#define DEBUG_CODECVT
  21. #ifdef DEBUG_CODECVT
  22. #include <iostream>
  23. #endif
  24. namespace booster {
  25. namespace locale {
  26. namespace util {
  27. class utf8_converter : public base_converter {
  28. public:
  29. virtual int max_len() const
  30. {
  31. return 4;
  32. }
  33. virtual utf8_converter *clone() const
  34. {
  35. return new utf8_converter();
  36. }
  37. bool is_thread_safe() const
  38. {
  39. return true;
  40. }
  41. virtual uint32_t to_unicode(char const *&begin,char const *end)
  42. {
  43. char const *p=begin;
  44. utf::code_point c = utf::utf_traits<char>::decode(p,end);
  45. if(c==utf::illegal)
  46. return illegal;
  47. if(c==utf::incomplete)
  48. return incomplete;
  49. begin = p;
  50. return c;
  51. }
  52. virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
  53. {
  54. if(!utf::is_valid_codepoint(u))
  55. return illegal;
  56. int width = utf::utf_traits<char>::width(u);
  57. ptrdiff_t d=end-begin;
  58. if(d < width)
  59. return incomplete;
  60. utf::utf_traits<char>::encode(u,begin);
  61. return width;
  62. }
  63. }; // utf8_converter
  64. class simple_converter : public base_converter {
  65. public:
  66. virtual ~simple_converter()
  67. {
  68. }
  69. simple_converter(std::string const &encoding)
  70. {
  71. for(unsigned i=0;i<128;i++)
  72. to_unicode_tbl_[i]=i;
  73. for(unsigned i=128;i<256;i++) {
  74. char buf[2] = { char(i) , 0 };
  75. try {
  76. std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop);
  77. if(tmp.size() == 1) {
  78. to_unicode_tbl_[i] = tmp[0];
  79. }
  80. else {
  81. to_unicode_tbl_[i] = illegal;
  82. }
  83. }
  84. catch(conv::conversion_error const &/*e*/) {
  85. to_unicode_tbl_[i] = illegal;
  86. }
  87. }
  88. from_unicode_tbl_.resize(256);
  89. for(unsigned i=0;i<256;i++) {
  90. from_unicode_tbl_[to_unicode_tbl_[i] & 0xFF].push_back(i);
  91. }
  92. }
  93. virtual int max_len() const
  94. {
  95. return 1;
  96. }
  97. virtual bool is_thread_safe() const
  98. {
  99. return true;
  100. }
  101. virtual base_converter *clone() const
  102. {
  103. return new simple_converter(*this);
  104. }
  105. virtual uint32_t to_unicode(char const *&begin,char const *end)
  106. {
  107. if(begin==end)
  108. return incomplete;
  109. unsigned char c = *begin++;
  110. return to_unicode_tbl_[c];
  111. }
  112. virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
  113. {
  114. if(begin==end)
  115. return incomplete;
  116. std::vector<unsigned char> const &tbl = from_unicode_tbl_[u & 0xFF];
  117. for(std::vector<unsigned char>::const_iterator p=tbl.begin();p!=tbl.end();++p) {
  118. if(to_unicode_tbl_[*p]==u) {
  119. *begin++ = *p;
  120. return 1;
  121. }
  122. }
  123. return illegal;
  124. }
  125. private:
  126. uint32_t to_unicode_tbl_[256];
  127. std::vector<std::vector<unsigned char> > from_unicode_tbl_;
  128. };
  namespace {
      //
      // Encoding names accepted by create_simple_converter().
      //
      // The array is searched with std::binary_search using compare_strings, so the
      // entries MUST stay in strcmp() order (note "iso885913" and "iso885915" sort
      // before "iso88592").
      //
      char const *simple_encoding_table[] = {
          // cp125x
          "cp1250", "cp1251", "cp1252", "cp1253",
          "cp1254", "cp1255", "cp1256", "cp1257",
          // iso8859-x
          "iso88591", "iso885913", "iso885915",
          "iso88592", "iso88593", "iso88594", "iso88595",
          "iso88596", "iso88597", "iso88598", "iso88599",
          // others
          "koi8r", "koi8u", "usascii",
          // windows125x
          "windows1250", "windows1251", "windows1252", "windows1253",
          "windows1254", "windows1255", "windows1256", "windows1257"
      };

      // Strict "less than" ordering of two C strings, as defined by strcmp().
      bool compare_strings(char const *l,char const *r)
      {
          int const order = strcmp(l,r);
          return order < 0;
      }
  }
  167. std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding)
  168. {
  169. std::auto_ptr<base_converter> res;
  170. std::string norm = conv::impl::normalize_encoding(encoding.c_str());
  171. if(std::binary_search<char const **>( simple_encoding_table,
  172. simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *),
  173. norm.c_str(),
  174. compare_strings))
  175. {
  176. res.reset(new simple_converter(encoding));
  177. }
  178. return res;
  179. }
  180. std::auto_ptr<base_converter> create_utf8_converter()
  181. {
  182. std::auto_ptr<base_converter> res(new utf8_converter());
  183. return res;
  184. }
  //
  // Maps a character type to the fixed-width unsigned integer of the same size.
  //
  // Only the 2-byte and 4-byte cases are defined; the primary template is left
  // incomplete, so a character type of any other size fails to compile.
  //
  template<typename Char,int Size=sizeof(Char)>
  struct uchar_traits;

  // 2-byte character types
  template<typename Char>
  struct uchar_traits<Char,2> {
      typedef uint16_t uint_type;
  };

  // 4-byte character types
  template<typename Char>
  struct uchar_traits<Char,4> {
      typedef uint32_t uint_type;
  };
  198. // Real codecvt
  199. template<typename CharType>
  200. class code_converter : public std::codecvt<CharType,char,mbstate_t>
  201. {
  202. public:
  203. code_converter(std::auto_ptr<base_converter> cvt,size_t refs = 0) :
  204. std::codecvt<CharType,char,mbstate_t>(refs),
  205. cvt_(cvt)
  206. {
  207. max_len_ = cvt_->max_len();
  208. }
  209. protected:
  210. typedef CharType uchar;
  211. virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
  212. {
  213. uint16_t &state = *reinterpret_cast<uint16_t *>(&s);
  214. #ifdef DEBUG_CODECVT
  215. std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
  216. #endif
  217. if(state != 0)
  218. return std::codecvt_base::error;
  219. next=from;
  220. return std::codecvt_base::ok;
  221. }
  222. virtual int do_encoding() const throw()
  223. {
  224. return 0;
  225. }
  226. virtual int do_max_length() const throw()
  227. {
  228. return max_len_;
  229. }
  230. virtual bool do_always_noconv() const throw()
  231. {
  232. return false;
  233. }
  234. virtual std::codecvt_base::result
  235. do_in( std::mbstate_t &state,
  236. char const *from,
  237. char const *from_end,
  238. char const *&from_next,
  239. uchar *uto,
  240. uchar *uto_end,
  241. uchar *&uto_next) const
  242. {
  243. typedef typename uchar_traits<uchar>::uint_type uint_type;
  244. uint_type *to=reinterpret_cast<uint_type *>(uto);
  245. uint_type *to_end=reinterpret_cast<uint_type *>(uto_end);
  246. uint_type *&to_next=reinterpret_cast<uint_type *&>(uto_next);
  247. return do_real_in(state,from,from_end,from_next,to,to_end,to_next);
  248. }
  249. virtual int
  250. do_length( std::mbstate_t &state,
  251. char const *from,
  252. char const *from_end,
  253. size_t max) const
  254. {
  255. char const *from_next=from;
  256. std::vector<uchar> chrs(max+1);
  257. uchar *to=&chrs.front();
  258. uchar *to_end=to+max;
  259. uchar *to_next=to;
  260. do_in(state,from,from_end,from_next,to,to_end,to_next);
  261. return from_next-from;
  262. }
  263. virtual std::codecvt_base::result
  264. do_out( std::mbstate_t &state,
  265. uchar const *ufrom,
  266. uchar const *ufrom_end,
  267. uchar const *&ufrom_next,
  268. char *to,
  269. char *to_end,
  270. char *&to_next) const
  271. {
  272. typedef typename uchar_traits<uchar>::uint_type uint_type;
  273. uint_type const *from=reinterpret_cast<uint_type const *>(ufrom);
  274. uint_type const *from_end=reinterpret_cast<uint_type const *>(ufrom_end);
  275. uint_type const *&from_next=reinterpret_cast<uint_type const *&>(ufrom_next);
  276. return do_real_out(state,from,from_end,from_next,to,to_end,to_next);
  277. }
  278. private:
  279. //
  280. // Implementation for UTF-32
  281. //
  282. std::codecvt_base::result
  283. do_real_in( std::mbstate_t &/*state*/,
  284. char const *from,
  285. char const *from_end,
  286. char const *&from_next,
  287. uint32_t *to,
  288. uint32_t *to_end,
  289. uint32_t *&to_next) const
  290. {
  291. std::auto_ptr<base_converter> cvtp;
  292. base_converter *cvt = 0;
  293. if(cvt_->is_thread_safe()) {
  294. cvt = cvt_.get();
  295. }
  296. else {
  297. cvtp.reset(cvt_->clone());
  298. cvt = cvtp.get();
  299. }
  300. std::codecvt_base::result r=std::codecvt_base::ok;
  301. while(to < to_end && from < from_end)
  302. {
  303. uint32_t ch=cvt->to_unicode(from,from_end);
  304. if(ch==base_converter::illegal) {
  305. r=std::codecvt_base::error;
  306. break;
  307. }
  308. if(ch==base_converter::incomplete) {
  309. r=std::codecvt_base::partial;
  310. break;
  311. }
  312. *to++=ch;
  313. }
  314. from_next=from;
  315. to_next=to;
  316. if(r!=std::codecvt_base::ok)
  317. return r;
  318. if(from!=from_end)
  319. return std::codecvt_base::partial;
  320. return r;
  321. }
  322. //
  323. // Implementation for UTF-32
  324. //
  325. std::codecvt_base::result
  326. do_real_out(std::mbstate_t &/*state*/, // state is not used there
  327. uint32_t const *from,
  328. uint32_t const *from_end,
  329. uint32_t const *&from_next,
  330. char *to,
  331. char *to_end,
  332. char *&to_next) const
  333. {
  334. std::auto_ptr<base_converter> cvtp;
  335. base_converter *cvt = 0;
  336. if(cvt_->is_thread_safe()) {
  337. cvt = cvt_.get();
  338. }
  339. else {
  340. cvtp.reset(cvt_->clone());
  341. cvt = cvtp.get();
  342. }
  343. std::codecvt_base::result r=std::codecvt_base::ok;
  344. while(to < to_end && from < from_end)
  345. {
  346. uint32_t len=cvt->from_unicode(*from,to,to_end);
  347. if(len==base_converter::illegal) {
  348. r=std::codecvt_base::error;
  349. break;
  350. }
  351. if(len==base_converter::incomplete) {
  352. r=std::codecvt_base::partial;
  353. break;
  354. }
  355. from++;
  356. to+=len;
  357. }
  358. from_next=from;
  359. to_next=to;
  360. if(r!=std::codecvt_base::ok)
  361. return r;
  362. if(from!=from_end)
  363. return std::codecvt_base::partial;
  364. return r;
  365. }
  //
  // Implementation for UTF-16 (2-byte internal characters): bytes -> UTF-16 code units.
  //
  // Code points up to 0xFFFF are written as a single unit. A code point above 0xFFFF
  // is written as a surrogate pair over two loop iterations (possibly across two
  // calls), using the first 16 bits of std_state as a 0/1 flag between them.
  //
  // Returns error on an illegal input sequence, partial on an incomplete one, and
  // also partial when no error occurred but input is left over or a surrogate pair
  // is only half written (state != 0); otherwise ok.
  //
  std::codecvt_base::result
  do_real_in( std::mbstate_t &std_state,
              char const *from,
              char const *from_end,
              char const *&from_next,
              uint16_t *to,
              uint16_t *to_end,
              uint16_t *&to_next) const
  {
      // Use the shared converter when it is thread safe, otherwise a clone owned by cvtp
      std::auto_ptr<base_converter> cvtp;
      base_converter *cvt = 0;
      if(cvt_->is_thread_safe()) {
          cvt = cvt_.get();
      }
      else {
          cvtp.reset(cvt_->clone());
          cvt = cvtp.get();
      }
      std::codecvt_base::result r=std::codecvt_base::ok;
      // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
      // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
      //
      // 0 - no half-written pair is pending
      // 1 - a code point above 0xFFFF was observed and the first surrogate of its
      //     pair was written, but the input for it has not been consumed yet
      uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
      while(to < to_end && from < from_end)
      {
  #ifdef DEBUG_CODECVT
          std::cout << "Entering IN--------------" << std::endl;
          std::cout << "State " << std::hex << state <<std::endl;
          std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
  #endif
          // Remember where this character starts so the read can be undone below
          char const *from_saved = from;
          uint32_t ch=cvt->to_unicode(from,from_end);
          if(ch==base_converter::illegal) {
              r=std::codecvt_base::error;
              break;
          }
          if(ch==base_converter::incomplete) {
              r=std::codecvt_base::partial;
              break;
          }
          // Normal code points go directly to the stream
          if(ch <= 0xFFFF) {
              *to++=ch;
          }
          else {
              // For other code points we do the following:
              //
              // 1. We can't consume our input yet, as we might find ourselves in a state
              //    where all input is consumed but not all output is written, i.e. only
              //    the first surrogate of the pair is written.
              // 2. We only write the first surrogate and mark this in the state; we also
              //    revert the from pointer so that this code point is read once again,
              //    and then we consume the input together with writing the second
              //    surrogate of the pair.
              ch-=0x10000;
              uint16_t vh = ch >> 10;
              uint16_t vl = ch & 0x3FF;
              uint16_t w1 = vh + 0xD800;
              uint16_t w2 = vl + 0xDC00;
              if(state == 0) {
                  from = from_saved;
                  *to++ = w1;
                  state = 1;
              }
              else {
                  *to++ = w2;
                  state = 0;
              }
          }
      }
      from_next=from;
      to_next=to;
      // Leftover input or a half-written surrogate pair both mean "call again"
      if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
          r = std::codecvt_base::partial;
  #ifdef DEBUG_CODECVT
      std::cout << "Returning ";
      switch(r) {
      case std::codecvt_base::ok:
          std::cout << "ok" << std::endl;
          break;
      case std::codecvt_base::partial:
          std::cout << "partial" << std::endl;
          break;
      case std::codecvt_base::error:
          std::cout << "error" << std::endl;
          break;
      default:
          std::cout << "other" << std::endl;
          break;
      }
      std::cout << "State " << std::hex << state <<std::endl;
      std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
  #endif
      return r;
  }
  //
  // Implementation for UTF-16 (2-byte internal characters): UTF-16 code units -> bytes.
  //
  // A first (high) surrogate is consumed and remembered in the first 16 bits of
  // std_state; the following unit must then be a second (low) surrogate, and the
  // combined code point is encoded with from_unicode(). All other units are encoded
  // directly.
  //
  // Returns error for a lone second surrogate, for a first surrogate not followed by
  // a second one, or when the converter rejects the code point; partial when the
  // output space is too small for the next character or input is left over; otherwise ok.
  //
  std::codecvt_base::result
  do_real_out(std::mbstate_t &std_state,
              uint16_t const *from,
              uint16_t const *from_end,
              uint16_t const *&from_next,
              char *to,
              char *to_end,
              char *&to_next) const
  {
      // Use the shared converter when it is thread safe, otherwise a clone owned by cvtp
      std::auto_ptr<base_converter> cvtp;
      base_converter *cvt = 0;
      if(cvt_->is_thread_safe()) {
          cvt = cvt_.get();
      }
      else {
          cvtp.reset(cvt_->clone());
          cvt = cvtp.get();
      }
      std::codecvt_base::result r=std::codecvt_base::ok;
      // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
      // according to the standard. We assume that sizeof(mbstate_t) >= 2 in order
      // to be able to store the first observed surrogate of a pair.
      //
      // State: state != 0 - a first surrogate was observed (state = that surrogate),
      // we expect the second one to come and then zero the state.
      //
      uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
      while(to < to_end && from < from_end)
      {
  #ifdef DEBUG_CODECVT
          std::cout << "Entering OUT --------------" << std::endl;
          std::cout << "State " << std::hex << state <<std::endl;
          std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
  #endif
          uint32_t ch=0;
          if(state != 0) {
              // If the state indicates that the first surrogate was already seen,
              // we should make sure that the unit that comes next is actually
              // a second surrogate
              uint16_t w1 = state;
              uint16_t w2 = *from;
              // we don't advance from yet, as writing may fail due to an incomplete or
              // partial conversion
              if(0xDC00 <= w2 && w2<=0xDFFF) {
                  uint16_t vh = w1 - 0xD800;
                  uint16_t vl = w2 - 0xDC00;
                  ch=((uint32_t(vh) << 10) | vl) + 0x10000;
              }
              else {
                  // Invalid surrogate
                  r=std::codecvt_base::error;
                  break;
              }
          }
          else {
              ch = *from;
              if(0xD800 <= ch && ch<=0xDBFF) {
                  // If this is a first surrogate we put it into the state and
                  // consume it. Note that we don't pass it on to the converter
                  // (a lone surrogate would be illegal there), so we advance
                  // the from pointer manually
                  state = ch;
                  from++;
                  continue;
              }
              else if(0xDC00 <= ch && ch<=0xDFFF) {
                  // If we observe a second surrogate while only a first one
                  // may be expected, we break from the loop with an error
                  // as it is illegal input
                  r=std::codecvt_base::error;
                  break;
              }
          }
          uint32_t len=cvt->from_unicode(ch,to,to_end);
          if(len==base_converter::illegal) {
              r=std::codecvt_base::error;
              break;
          }
          if(len==base_converter::incomplete) {
              r=std::codecvt_base::partial;
              break;
          }
          // The character was written: clear any pending surrogate and commit
          state = 0;
          to+=len;
          from++;
      }
      from_next=from;
      to_next=to;
      if(r==std::codecvt_base::ok && from!=from_end)
          r = std::codecvt_base::partial;
  #ifdef DEBUG_CODECVT
      std::cout << "Returning ";
      switch(r) {
      case std::codecvt_base::ok:
          std::cout << "ok" << std::endl;
          break;
      case std::codecvt_base::partial:
          std::cout << "partial" << std::endl;
          break;
      case std::codecvt_base::error:
          std::cout << "error" << std::endl;
          break;
      default:
          std::cout << "other" << std::endl;
          break;
      }
      std::cout << "State " << std::hex << state <<std::endl;
      std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
  #endif
      return r;
  }
  579. int max_len_;
  580. std::auto_ptr<base_converter> cvt_;
  581. };
  // Compile-time size check: the array bound evaluates to -1, which is ill-formed,
  // when mbstate_t is smaller than 2 bytes. code_converter reinterprets the start
  // of an mbstate_t as a uint16_t, so it needs at least that much room.
  BOOSTER_UNUSED static const char ensure_mbstate_size_is_at_least_2[sizeof(mbstate_t) >= 2 ? 1 : -1] = {0};
  583. template<>
  584. class code_converter<char> : public std::codecvt<char,char,mbstate_t>
  585. {
  586. public:
  587. code_converter(std::auto_ptr<base_converter> /*cvt*/,size_t refs = 0) :
  588. std::codecvt<char,char,mbstate_t>(refs)
  589. {
  590. }
  591. };
  592. std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type)
  593. {
  594. if(!cvt.get())
  595. cvt.reset(new base_converter());
  596. switch(type) {
  597. case char_facet:
  598. return std::locale(in,new code_converter<char>(cvt));
  599. case wchar_t_facet:
  600. return std::locale(in,new code_converter<wchar_t>(cvt));
  601. #if defined(BOOSTER_HAS_CHAR16_T) && !defined(BOOSTER_NO_CHAR16_T_CODECVT)
  602. case char16_t_facet:
  603. return std::locale(in,new code_converter<char16_t>(cvt));
  604. #endif
  605. #if defined(BOOSTER_HAS_CHAR32_T) && !defined(BOOSTER_NO_CHAR32_T_CODECVT)
  606. case char32_t_facet:
  607. return std::locale(in,new code_converter<char32_t>(cvt));
  608. #endif
  609. default:
  610. return in;
  611. }
  612. }
  613. } // util
  614. } // locale
  } // booster
  616. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4