ChipMaster's trial hacks on C++CMS starting with v1.2.1. Not sure I'll follow on with the v2 since it looks to be breaking and mostly frivolous.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

535 lines
15 KiB

  1. #define CPPCMS_SOURCE
  2. #include "encoding_validators.h"
  3. #include "encoding.h"
  4. #include "utf_iterator.h"
  5. #include "cppcms_error.h"
  6. #include "localization.h"
  7. #include "config.h"
  8. #include <errno.h>
  9. #include <string>
  10. #include <stdexcept>
  11. #include <iostream>
  12. #ifdef HAVE_ICONV
  13. #include <iconv.h>
  14. #else
  15. #include <unicode/ucnv.h> // For Win32
  16. #endif
  17. namespace cppcms {
  18. namespace encoding {
  19. namespace impl{
  20. #ifdef HAVE_ICONV
  // iconv() flavour whose input-buffer parameter is declared `char **`.
  typedef size_t (*posix_iconv_type)(iconv_t cd,char **, size_t *t,char **,size_t *);
  // iconv() flavour whose input-buffer parameter is declared `char const **`.
  typedef size_t (*gnu_iconv_type)(iconv_t cd,char const **, size_t *t,char **,size_t *);

  // Calls `cv` with a const input buffer regardless of which of the two
  // signatures the function has; overload resolution on the type of `cv`
  // selects the matching adapter.
  size_t do_iconv(posix_iconv_type cv,iconv_t cd,char const **inbuf, size_t *inbytesleft,char **outbuf,size_t *outbytesleft)
  {
      // This flavour takes a non-const input pointer, so constness is dropped
      // for the duration of the call only.
      char **writable_in=const_cast<char **>(inbuf);
      size_t const rc=cv(cd,writable_in,inbytesleft,outbuf,outbytesleft);
      return rc;
  }

  // Same as above for the flavour that already accepts `char const **`:
  // the arguments are forwarded untouched.
  size_t do_iconv(gnu_iconv_type cv,iconv_t cd,char const **inbuf, size_t *inbytesleft,char **outbuf,size_t *outbytesleft)
  {
      size_t const rc=cv(cd,inbuf,inbytesleft,outbuf,outbytesleft);
      return rc;
  }
  // Returns the name of the UTF-32 variant matching this machine's byte
  // order: "UTF-32BE" or "UTF-32LE".
  char const *native_utf32_encoding()
  {
      // Store a known two-byte pattern and look at the byte that comes first
      // in memory; if it is the high byte (0x0a) the machine is big endian.
      uint16_t probe=0x0a0b;
      char const first_byte=*reinterpret_cast<char const *>(&probe);
      if(first_byte==0x0a)
          return "UTF-32BE";
      return "UTF-32LE";
  }
  // Returns the name of the UTF-16 variant matching this machine's byte
  // order: "UTF-16BE" or "UTF-16LE".
  char const *native_utf16_encoding()
  {
      // Same probe as for UTF-32: the first byte in memory of 0x0a0b is 0x0a
      // only on a big-endian machine.
      uint16_t probe=0x0a0b;
      char const first_byte=*reinterpret_cast<char const *>(&probe);
      if(first_byte==0x0a)
          return "UTF-16BE";
      return "UTF-16LE";
  }
  47. char const *native_wchar_encoding()
  48. {
  49. if(sizeof(wchar_t)==4)
  50. return native_utf32_encoding();
  51. if(sizeof(wchar_t)==2)
  52. return native_utf16_encoding();
  53. throw std::runtime_error("wchar_t does not support unicode!");
  54. }
  55. class iconv_validator {
  56. public:
  57. iconv_validator(std::string const &charset) :
  58. descriptor_((iconv_t)(-1))
  59. {
  60. descriptor_=iconv_open(native_utf32_encoding(),charset.c_str());
  61. if(descriptor_==(iconv_t)(-1)) {
  62. throw std::runtime_error("Failed to load iconv tables for:" + charset);
  63. }
  64. }
  65. ~iconv_validator()
  66. {
  67. if(descriptor_!=(iconv_t)(-1))
  68. iconv_close(descriptor_);
  69. }
  70. bool valid(char const *begin,char const *end,size_t &count)
  71. {
  72. iconv(descriptor_,0,0,0,0); // reset
  73. uint32_t buffer[64];
  74. size_t input=end-begin;
  75. while(begin!=end) {
  76. uint32_t *output=buffer;
  77. size_t outsize=sizeof(buffer);
  78. size_t res=do_iconv(::iconv,descriptor_,&begin,&input,(char**)&output,&outsize);
  79. if(res==(size_t)(-1) && errno==E2BIG) {
  80. if(!check_symbols(buffer,output,count))
  81. return false;
  82. continue;
  83. }
  84. if(res!=(size_t)(-1) && input==0)
  85. return check_symbols(buffer,output,count);
  86. return false;
  87. }
  88. return true;
  89. }
  90. private:
  91. iconv_t descriptor_;
  92. iconv_validator(iconv_validator const &other);
  93. iconv_validator const &operator=(iconv_validator const &);
  94. bool check_symbols(uint32_t const *begin,uint32_t const *end,size_t &count)
  95. {
  96. while(begin!=end) {
  97. uint32_t c=*begin++;
  98. count++;
  99. if(c==0x09 || c==0xA || c==0xD)
  100. continue;
  101. if(c<0x20 || (0x7F<=c && c<=0x9F))
  102. return false;
  103. }
  104. return true;
  105. }
  106. };
  107. typedef iconv_validator validator;
  108. #else // NO HAVE_ICONV
  109. class uconv_validator {
  110. public:
  111. uconv_validator(std::string const &charset) :
  112. uconv_(0)
  113. {
  114. UErrorCode err=U_ZERO_ERROR;
  115. uconv_=ucnv_open(charset.c_str(),&err);
  116. if(!uconv_)
  117. throw cppcms_error("Invalid encoding:" + charset + u_errorName(err));
  118. err=U_ZERO_ERROR;
  119. ucnv_setToUCallBack(uconv_,UCNV_TO_U_CALLBACK_STOP,0,0,0,&err);
  120. if(U_FAILURE(err)) {
  121. ucnv_close(uconv_);
  122. throw cppcms_error("Invalid encoding:" + charset + u_errorName(err));
  123. }
  124. }
  125. ~uconv_validator()
  126. {
  127. if(uconv_) ucnv_close(uconv_);
  128. }
  129. bool valid(char const *begin,char const *end,size_t &count)
  130. {
  131. UChar buffer[64];
  132. UChar *ubegin=buffer;
  133. UChar *uend=ubegin+64;
  134. count = 0;
  135. while(begin!=end) {
  136. UErrorCode err=U_ZERO_ERROR;
  137. ucnv_toUnicode(uconv_,&ubegin,uend,&begin,end,0,1,&err);
  138. if(err==U_BUFFER_OVERFLOW_ERROR) {
  139. if(!check_symbols(buffer,ubegin,count))
  140. return false;
  141. }
  142. else if(U_FAILURE(err))
  143. return false;
  144. }
  145. return true;
  146. }
  147. private:
  148. UConverter *uconv_;
  149. uconv_validator(uconv_validator const &other);
  150. void operator=(uconv_validator const &);
  151. bool check_symbols(UChar const *begin,UChar const *end,size_t &count)
  152. {
  153. while(begin!=end) {
  154. UChar c=*begin++;
  155. if(!U_IS_SURROGATE(c) || U_IS_SURROGATE_LEAD(c))
  156. count++;
  157. if(c==0x09 || c==0xA || c==0xD)
  158. continue;
  159. if(c<0x20 || (0x7F<=c && c<=0x9F))
  160. return false;
  161. }
  162. return true;
  163. }
  164. };
  165. typedef uconv_validator validator;
  166. #endif
  167. struct validators_set {
  168. typedef bool (*encoding_tester_type)(char const *begin,char const *end,size_t &count);
  169. encoding_tester_type get(char const *str) const
  170. {
  171. std::string name=str;
  172. for(unsigned i=0;i<name.size();i++)
  173. if('A' <= name[i] && name[i] <= 'Z')
  174. name[i]-=('A'-'a');
  175. std::map<std::string,encoding_tester_type>::const_iterator p;
  176. p=predefined_.find(name);
  177. if(p==predefined_.end())
  178. return 0;
  179. return p->second;
  180. }
  181. validators_set()
  182. {
  183. encoding_tester_type iso_tester=&iso_8859_1_2_4_5_9_10_13_14_15_16_valid<char const *>;
  184. predefined_["latin1"]=iso_tester;
  185. predefined_["iso88591"]=iso_tester;
  186. predefined_["iso88592"]=iso_tester;
  187. predefined_["iso88594"]=iso_tester;
  188. predefined_["iso88595"]=iso_tester;
  189. predefined_["iso88599"]=iso_tester;
  190. predefined_["iso885910"]=iso_tester;
  191. predefined_["iso885913"]=iso_tester;
  192. predefined_["iso885914"]=iso_tester;
  193. predefined_["iso885915"]=iso_tester;
  194. predefined_["iso885916"]=iso_tester;
  195. predefined_["8859_1"]=iso_tester;
  196. predefined_["8859_2"]=iso_tester;
  197. predefined_["8859_4"]=iso_tester;
  198. predefined_["8859_5"]=iso_tester;
  199. predefined_["8859_9"]=iso_tester;
  200. predefined_["8859_10"]=iso_tester;
  201. predefined_["8859_13"]=iso_tester;
  202. predefined_["8859_14"]=iso_tester;
  203. predefined_["8859_15"]=iso_tester;
  204. predefined_["8859_16"]=iso_tester;
  205. predefined_["iso8859-1"]=iso_tester;
  206. predefined_["iso8859-2"]=iso_tester;
  207. predefined_["iso8859-4"]=iso_tester;
  208. predefined_["iso8859-5"]=iso_tester;
  209. predefined_["iso8859-9"]=iso_tester;
  210. predefined_["iso8859-10"]=iso_tester;
  211. predefined_["iso8859-13"]=iso_tester;
  212. predefined_["iso8859-14"]=iso_tester;
  213. predefined_["iso8859-15"]=iso_tester;
  214. predefined_["iso8859-16"]=iso_tester;
  215. predefined_["iso_8859-1"]=iso_tester;
  216. predefined_["iso_8859-2"]=iso_tester;
  217. predefined_["iso_8859-4"]=iso_tester;
  218. predefined_["iso_8859-5"]=iso_tester;
  219. predefined_["iso_8859-9"]=iso_tester;
  220. predefined_["iso_8859-10"]=iso_tester;
  221. predefined_["iso_8859-13"]=iso_tester;
  222. predefined_["iso_8859-14"]=iso_tester;
  223. predefined_["iso_8859-15"]=iso_tester;
  224. predefined_["iso_8859-16"]=iso_tester;
  225. predefined_["iso-8859-1"]=iso_tester;
  226. predefined_["iso-8859-2"]=iso_tester;
  227. predefined_["iso-8859-4"]=iso_tester;
  228. predefined_["iso-8859-5"]=iso_tester;
  229. predefined_["iso-8859-9"]=iso_tester;
  230. predefined_["iso-8859-10"]=iso_tester;
  231. predefined_["iso-8859-13"]=iso_tester;
  232. predefined_["iso-8859-14"]=iso_tester;
  233. predefined_["iso-8859-15"]=iso_tester;
  234. predefined_["iso-8859-16"]=iso_tester;
  235. predefined_["iso88593"]=&iso_8859_3_valid<char const *>;
  236. predefined_["iso88596"]=&iso_8859_6_valid<char const *>;
  237. predefined_["iso88597"]=&iso_8859_7_valid<char const *>;
  238. predefined_["iso88598"]=&iso_8859_8_valid<char const *>;
  239. predefined_["iso885911"]=&iso_8859_11_valid<char const *>;
  240. predefined_["iso8859-3"]=&iso_8859_3_valid<char const *>;
  241. predefined_["iso8859-6"]=&iso_8859_6_valid<char const *>;
  242. predefined_["iso8859-7"]=&iso_8859_7_valid<char const *>;
  243. predefined_["iso8859-8"]=&iso_8859_8_valid<char const *>;
  244. predefined_["iso8859-11"]=&iso_8859_11_valid<char const *>;
  245. predefined_["8859_3"]=&iso_8859_3_valid<char const *>;
  246. predefined_["8859_6"]=&iso_8859_6_valid<char const *>;
  247. predefined_["8859_7"]=&iso_8859_7_valid<char const *>;
  248. predefined_["8859_8"]=&iso_8859_8_valid<char const *>;
  249. predefined_["8859_11"]=&iso_8859_11_valid<char const *>;
  250. predefined_["iso_8859-3"]=&iso_8859_3_valid<char const *>;
  251. predefined_["iso_8859-6"]=&iso_8859_6_valid<char const *>;
  252. predefined_["iso_8859-7"]=&iso_8859_7_valid<char const *>;
  253. predefined_["iso_8859-8"]=&iso_8859_8_valid<char const *>;
  254. predefined_["iso_8859-11"]=&iso_8859_11_valid<char const *>;
  255. predefined_["iso-8859-3"]=&iso_8859_3_valid<char const *>;
  256. predefined_["iso-8859-6"]=&iso_8859_6_valid<char const *>;
  257. predefined_["iso-8859-7"]=&iso_8859_7_valid<char const *>;
  258. predefined_["iso-8859-8"]=&iso_8859_8_valid<char const *>;
  259. predefined_["iso-8859-11"]=&iso_8859_11_valid<char const *>;
  260. predefined_["windows-1250"]=&windows_1250_valid<char const *>;
  261. predefined_["windows-1251"]=&windows_1251_valid<char const *>;
  262. predefined_["windows-1252"]=&windows_1252_valid<char const *>;
  263. predefined_["windows-1253"]=&windows_1253_valid<char const *>;
  264. predefined_["windows-1255"]=&windows_1255_valid<char const *>;
  265. predefined_["windows-1256"]=&windows_1256_valid<char const *>;
  266. predefined_["windows-1257"]=&windows_1257_valid<char const *>;
  267. predefined_["windows-1258"]=&windows_1258_valid<char const *>;
  268. predefined_["cp1250"]=&windows_1250_valid<char const *>;
  269. predefined_["cp1251"]=&windows_1251_valid<char const *>;
  270. predefined_["cp1252"]=&windows_1252_valid<char const *>;
  271. predefined_["cp1253"]=&windows_1253_valid<char const *>;
  272. predefined_["cp1255"]=&windows_1255_valid<char const *>;
  273. predefined_["cp1256"]=&windows_1256_valid<char const *>;
  274. predefined_["cp1257"]=&windows_1257_valid<char const *>;
  275. predefined_["cp1258"]=&windows_1258_valid<char const *>;
  276. predefined_["1250"]=&windows_1250_valid<char const *>;
  277. predefined_["1251"]=&windows_1251_valid<char const *>;
  278. predefined_["1252"]=&windows_1252_valid<char const *>;
  279. predefined_["1253"]=&windows_1253_valid<char const *>;
  280. predefined_["1255"]=&windows_1255_valid<char const *>;
  281. predefined_["1256"]=&windows_1256_valid<char const *>;
  282. predefined_["1257"]=&windows_1257_valid<char const *>;
  283. predefined_["1258"]=&windows_1258_valid<char const *>;
  284. predefined_["koi8r"]=predefined_["koi8-r"]=&koi8_valid<char const *>;
  285. predefined_["koi8u"]=predefined_["koi8-u"]=&koi8_valid<char const *>;
  286. predefined_["utf8"]=predefined_["utf-8"]=&utf8_valid<char const *>;
  287. predefined_["us-ascii"]=predefined_["ascii"]=&ascii_valid<char const *>;
  288. }
  289. private:
  290. std::map<std::string,encoding_tester_type> predefined_;
  291. } all_validators;
  292. } // impl
  293. bool CPPCMS_API valid_utf8(char const *begin,char const *end,size_t &count)
  294. {
  295. return utf8_valid(begin,end,count);
  296. }
  297. bool CPPCMS_API valid(std::string const &encoding,char const *begin,char const *end,size_t &count)
  298. {
  299. return valid(encoding.c_str(),begin,end,count);
  300. }
  301. bool CPPCMS_API valid(std::locale const &loc,char const *begin,char const *end,size_t &count)
  302. {
  303. return valid(std::use_facet<locale::info>(loc).encoding().c_str(),begin,end,count);
  304. }
  305. bool CPPCMS_API valid(char const *encoding,char const *begin,char const *end,size_t &count)
  306. {
  307. impl::validators_set::encoding_tester_type tester = impl::all_validators.get(encoding);
  308. if(tester)
  309. return tester(begin,end,count);
  310. try {
  311. impl::validator vtester(encoding);
  312. return vtester.valid(begin,end,count);
  313. }
  314. catch(std::runtime_error const &e) {
  315. return false;
  316. }
  317. }
  // Tells whether `c_encoding` names UTF-8: "utf8" or "utf-8" with the
  // letters in any case. Charset names are looked up case-insensitively
  // elsewhere in this file (validators_set::get), so spellings such as
  // "Utf-8" are recognised here too. A null pointer is not UTF-8.
  inline bool is_utf8(char const *c_encoding)
  {
      if(!c_encoding)
          return false;
      char const letters[]="utf";
      char const *p=c_encoding;
      for(char const *want=letters;*want;++want,++p) {
          char c=*p;
          if('A' <= c && c <= 'Z')
              c-=('A'-'a'); // ASCII-only lower-casing, independent of the locale
          if(c!=*want)
              return false; // also stops at the terminator of a too-short name
      }
      if(*p=='-')
          ++p; // a single optional dash between "utf" and "8"
      return p[0]=='8' && p[1]=='\0';
  }
  325. #ifdef HAVE_ICONV
  326. namespace impl {
  327. std::string iconv_convert_to(char const *to,char const *from,char const *begin,char const *end)
  328. {
  329. iconv_t d=iconv_open(to,from);
  330. if(d==(iconv_t)(-1))
  331. throw cppcms_error("Unsupported encoding "+std::string(to));
  332. std::string result;
  333. try {
  334. char buffer[256];
  335. size_t input=end-begin;
  336. while(begin!=end) {
  337. char *output=buffer;
  338. size_t outsize=sizeof(buffer);
  339. size_t res=do_iconv(::iconv,d,&begin,&input,(char**)&output,&outsize);
  340. if(res==(size_t)(-1) && errno==E2BIG) {
  341. result.append(buffer,output);
  342. continue;
  343. }
  344. if(res!=(size_t)(-1) && input==0) {
  345. result.append(buffer,output);
  346. break;
  347. }
  348. break;
  349. }
  350. }
  351. catch(...) {
  352. iconv_close(d);
  353. }
  354. iconv_close(d);
  355. return result;
  356. }
  357. } // impl
  358. #endif
  359. std::string CPPCMS_API to_utf8(char const *c_encoding,char const *begin,char const *end)
  360. {
  361. std::string result;
  362. if(is_utf8(c_encoding)) {
  363. result.assign(begin,end-begin);
  364. return result;
  365. }
  366. #ifdef HAVE_ICONV
  367. return impl::iconv_convert_to("UTF-8",c_encoding,begin,end);
  368. #else // USE ICU
  369. locale::details::converter cvt(c_encoding);
  370. result.reserve(end-begin);
  371. std::vector<char> buf(cvt.max_len() * 64);
  372. while(begin<end) {
  373. uint32_t u=utf8::next(begin,end,false,true);
  374. if(u > 0x10FFFF) // error
  375. return result;
  376. char *tbegin=&buf[0];
  377. char *tend=tbegin+buf.size();
  378. uint32_t n = cvt.from_unicode(u ,tbegin,tend);
  379. if(n != locale::details::converter::illegal && n!= locale::details::converter::incomplete)
  380. result.append(tbegin,tbegin+n);
  381. else
  382. return result;
  383. }
  384. return result;
  385. #endif
  386. }
  387. std::string CPPCMS_API to_utf8(char const *encoding,std::string const &str)
  388. {
  389. if(is_utf8(encoding))
  390. return str;
  391. return to_utf8(encoding,str.data(),str.data()+str.size());
  392. }
  393. std::string CPPCMS_API to_utf8(std::locale const &loc,char const *begin,char const *end)
  394. {
  395. locale::info const &inf = std::use_facet<locale::info>(loc);
  396. if(inf.utf8())
  397. return std::string(begin,end-begin);
  398. else
  399. return to_utf8(inf.encoding().c_str(),begin,end);
  400. }
  401. std::string CPPCMS_API to_utf8(std::locale const &loc,std::string const &str)
  402. {
  403. locale::info const &inf = std::use_facet<locale::info>(loc);
  404. if(inf.utf8())
  405. return str;
  406. else
  407. return to_utf8(inf.encoding().c_str(),str);
  408. }
  409. ////////////////////
  410. // FROM
  411. ///////////////////
  412. std::string CPPCMS_API from_utf8(char const *c_encoding,char const *begin,char const *end)
  413. {
  414. std::string result;
  415. if(is_utf8(c_encoding)) {
  416. result.assign(begin,end-begin);
  417. return result;
  418. }
  419. #ifdef HAVE_ICONV
  420. return impl::iconv_convert_to(c_encoding,"UTF-8",begin,end);
  421. #else // USE ICU
  422. locale::details::converter cvt(c_encoding);
  423. result.reserve(end-begin);
  424. while(begin<end) {
  425. uint32_t u=cvt.to_unicode(begin,end);
  426. if(u > 0x10FFFF) // error
  427. return result;
  428. utf8::seq s=utf8::encode(u);
  429. result.append(s.c,s.len);
  430. }
  431. return result;
  432. #endif
  433. }
  434. std::string CPPCMS_API from_utf8(char const *encoding,std::string const &str)
  435. {
  436. if(is_utf8(encoding))
  437. return str;
  438. return from_utf8(encoding,str.data(),str.data()+str.size());
  439. }
  440. std::string CPPCMS_API from_utf8(std::locale const &loc,char const *begin,char const *end)
  441. {
  442. locale::info const &inf = std::use_facet<locale::info>(loc);
  443. if(inf.utf8())
  444. return std::string(begin,end-begin);
  445. else
  446. return from_utf8(inf.encoding().c_str(),begin,end);
  447. }
  448. std::string CPPCMS_API from_utf8(std::locale const &loc,std::string const &str)
  449. {
  450. locale::info const &inf = std::use_facet<locale::info>(loc);
  451. if(inf.utf8())
  452. return str;
  453. else
  454. return from_utf8(inf.encoding().c_str(),str);
  455. }
  456. } } // cppcms::encoding