diff options
Diffstat (limited to 'LibOVRKernel/Src/Kernel/OVR_UTF8Util.cpp')
-rw-r--r-- | LibOVRKernel/Src/Kernel/OVR_UTF8Util.cpp | 556 |
1 files changed, 556 insertions, 0 deletions
diff --git a/LibOVRKernel/Src/Kernel/OVR_UTF8Util.cpp b/LibOVRKernel/Src/Kernel/OVR_UTF8Util.cpp new file mode 100644 index 0000000..68e58ea --- /dev/null +++ b/LibOVRKernel/Src/Kernel/OVR_UTF8Util.cpp @@ -0,0 +1,556 @@ +/************************************************************************** + +Filename : OVR_UTF8Util.cpp +Content : UTF8 Unicode character encoding/decoding support +Created : September 19, 2012 +Notes : +Notes : Much useful info at "UTF-8 and Unicode FAQ" + http://www.cl.cam.ac.uk/~mgk25/unicode.html + +Copyright : Copyright 2014 Oculus VR, LLC All Rights reserved. + +Licensed under the Oculus VR Rift SDK License Version 3.2 (the "License"); +you may not use the Oculus VR Rift SDK except in compliance with the License, +which is provided at the time of installation or download, or which +otherwise accompanies this software in either electronic or hard copy form. + +You may obtain a copy of the License at + +http://www.oculusvr.com/licenses/LICENSE-3.2 + +Unless required by applicable law or agreed to in writing, the Oculus VR SDK +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +************************************************************************************/ + +#include "OVR_UTF8Util.h" + +namespace OVR { namespace UTF8Util { + +intptr_t OVR_STDCALL GetLength(const char* buf, intptr_t buflen) +{ + const char* p = buf; + intptr_t length = 0; + + if (buflen != -1) + { + while (p - buf < buflen) + { + // We should be able to have ASStrings with 0 in the middle. + UTF8Util::DecodeNextChar_Advance0(&p); + length++; + } + } + else + { + while (UTF8Util::DecodeNextChar_Advance0(&p)) + length++; + } + + return length; +} + +uint32_t OVR_STDCALL GetCharAt(intptr_t index, const char* putf8str, intptr_t length) +{ + const char* buf = putf8str; + uint32_t c = 0; + + if (length != -1) + { + while (buf - putf8str < length) + { + c = UTF8Util::DecodeNextChar_Advance0(&buf); + if (index == 0) + return c; + index--; + } + + return c; + } + + do + { + c = UTF8Util::DecodeNextChar_Advance0(&buf); + index--; + + if (c == 0) + { + // We've hit the end of the string; don't go further. + OVR_ASSERT(index == 0); + return c; + } + } while (index >= 0); + + return c; +} + +intptr_t OVR_STDCALL GetByteIndex(intptr_t index, const char *putf8str, intptr_t length) +{ + const char* buf = putf8str; + + if (length != -1) + { + while ((buf - putf8str) < length && index > 0) + { + UTF8Util::DecodeNextChar_Advance0(&buf); + index--; + } + + return buf-putf8str; + } + + while (index > 0) + { + uint32_t c = UTF8Util::DecodeNextChar_Advance0(&buf); + index--; + + if (c == 0) + return buf-putf8str; + }; + + return buf-putf8str; +} + +int OVR_STDCALL GetEncodeCharSize(uint32_t ucs_character) +{ + if (ucs_character <= 0x7F) + return 1; + else if (ucs_character <= 0x7FF) + return 2; + else if (ucs_character <= 0xFFFF) + return 3; + else if (ucs_character <= 0x1FFFFF) + return 4; + else if (ucs_character <= 0x3FFFFFF) + return 5; + else if (ucs_character <= 0x7FFFFFFF) + return 6; + else + return 0; +} + +uint32_t OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer) +{ + uint32_t uc; + char c; + + // Security considerations: + // + // Changed, this is now only the case for DecodeNextChar: + // - If we hit a zero byte, we want to return 0 without stepping + // the buffer pointer past the 0. th + // + // If we hit an "overlong sequence"; i.e. a character encoded + // in a longer multibyte string than is necessary, then we + // need to discard the character. This is so attackers can't + // disguise dangerous characters or character sequences -- + // there is only one valid encoding for each character. + // + // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE, + // 0xFFFF } then we ignore them; they are not valid in UTF-8. + + // This isn't actually an invalid character; it's a valid char that + // looks like an inverted question mark. +#define INVALID_CHAR 0x0FFFD + +#define FIRST_BYTE(mask, shift) \ + uc = (c & (mask)) << (shift); + +#define NEXT_BYTE(shift) \ + c = **putf8Buffer; \ + if (c == 0) return 0; /* end of buffer, do not advance */ \ + if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \ + (*putf8Buffer)++; \ + uc |= (c & 0x3F) << shift; + + c = **putf8Buffer; + (*putf8Buffer)++; + if (c == 0) + return 0; // End of buffer. + + if ((c & 0x80) == 0) return (uint32_t) c; // Conventional 7-bit ASCII. + + // Multi-byte sequences. + if ((c & 0xE0) == 0xC0) + { + // Two-byte sequence. + FIRST_BYTE(0x1F, 6); + NEXT_BYTE(0); + if (uc < 0x80) return INVALID_CHAR; // overlong + return uc; + } + else if ((c & 0xF0) == 0xE0) + { + // Three-byte sequence. + FIRST_BYTE(0x0F, 12); + NEXT_BYTE(6); + NEXT_BYTE(0); + if (uc < 0x800) return INVALID_CHAR; // overlong + // Not valid ISO 10646, but Flash requires these to work + // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0) + // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR; + // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646 + return uc; + } + else if ((c & 0xF8) == 0xF0) + { + // Four-byte sequence. + FIRST_BYTE(0x07, 18); + NEXT_BYTE(12); + NEXT_BYTE(6); + NEXT_BYTE(0); + if (uc < 0x010000) return INVALID_CHAR; // overlong + return uc; + } + else if ((c & 0xFC) == 0xF8) + { + // Five-byte sequence. + FIRST_BYTE(0x03, 24); + NEXT_BYTE(18); + NEXT_BYTE(12); + NEXT_BYTE(6); + NEXT_BYTE(0); + if (uc < 0x0200000) return INVALID_CHAR; // overlong + return uc; + } + else if ((c & 0xFE) == 0xFC) + { + // Six-byte sequence. + FIRST_BYTE(0x01, 30); + NEXT_BYTE(24); + NEXT_BYTE(18); + NEXT_BYTE(12); + NEXT_BYTE(6); + NEXT_BYTE(0); + if (uc < 0x04000000) return INVALID_CHAR; // overlong + return uc; + } + else + { + // Invalid. + return INVALID_CHAR; + } +} + + +void OVR_STDCALL EncodeChar(char* pbuffer, intptr_t* pindex, uint32_t ucs_character) +{ + if (ucs_character <= 0x7F) + { + // Plain single-byte ASCII. + pbuffer[(*pindex)++] = (char) ucs_character; + } + else if (ucs_character <= 0x7FF) + { + // Two bytes. + pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); + } + else if (ucs_character <= 0xFFFF) + { + // Three bytes. + pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); + } + else if (ucs_character <= 0x1FFFFF) + { + // Four bytes. + pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); + } + else if (ucs_character <= 0x3FFFFFF) + { + // Five bytes. + pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); + } + else if (ucs_character <= 0x7FFFFFFF) + { + // Six bytes. + pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F); + pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F); + } + else + { + // Invalid char; don't encode anything. + } +} + +intptr_t OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, intptr_t length) +{ + intptr_t len = 0; + if (length != -1) + for (int i = 0; i < length; i++) + { + len += GetEncodeCharSize(pchar[i]); + } + else + for (int i = 0;; i++) + { + if (pchar[i] == 0) + return len; + len += GetEncodeCharSize(pchar[i]); + } + return len; +} + +void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, intptr_t length) +{ + intptr_t ofs = 0; + if (length != -1) + { + for (int i = 0; i < length; i++) + { + EncodeChar(pbuff, &ofs, pchar[i]); + } + } + else + { + for (int i = 0;; i++) + { + if (pchar[i] == 0) + break; + EncodeChar(pbuff, &ofs, pchar[i]); + } + } + pbuff[ofs] = 0; +} + +size_t OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, intptr_t bytesLen) +{ + wchar_t *pbegin = pbuff; + if (bytesLen == -1) + { + while (1) + { + uint32_t ch = DecodeNextChar_Advance0(&putf8str); + if (ch == 0) + break; + else if (ch >= 0xFFFF) + ch = 0xFFFD; + *pbuff++ = wchar_t(ch); + } + } + else + { + const char* p = putf8str; + while ((p - putf8str) < bytesLen) + { + uint32_t ch = DecodeNextChar_Advance0(&p); + if (ch >= 0xFFFF) + ch = 0xFFFD; + *pbuff++ = wchar_t(ch); + } + } + + *pbuff = 0; + return pbuff - pbegin; +} + + +#ifdef UTF8_UNIT_TEST + +// Compile this test case with something like: +// +// gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test +// +// or +// +// cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I.. +// +// If possible, try running the test program with the first arg +// pointing at the file: +// +// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt +// +// and examine the results by eye to make sure they are acceptable to +// you. + + +#include "base/utility.h" +#include <stdio.h> + + +bool check_equal(const char* utf8_in, const uint32_t* ucs_in) +{ + for (;;) + { + uint32_t next_ucs = *ucs_in++; + uint32_t next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in); + if (next_ucs != next_ucs_from_utf8) + { + return false; + } + if (next_ucs == 0) + { + OVR_ASSERT(next_ucs_from_utf8 == 0); + break; + } + } + + return true; +} + + +void log_ascii(const char* line) +{ + for (;;) + { + unsigned char c = (unsigned char) *line++; + if (c == 0) + { + // End of line. + return; + } + else if (c != '\n' + && (c < 32 || c > 127)) + { + // Non-printable as plain ASCII. + printf("<0x%02X>", (int) c); + } + else + { + printf("%c", c); + } + } +} + + +void log_ucs(const uint32_t* line) +{ + for (;;) + { + uint32_t uc = *line++; + if (uc == 0) + { + // End of line. + return; + } + else if (uc != '\n' + && (uc < 32 || uc > 127)) + { + // Non-printable as plain ASCII. + printf("<U-%04X>", uc); + } + else + { + printf("%c", (char) uc); + } + } +} + + +// Simple canned test. +int main(int argc, const char* argv[]) +{ + { + const char* test8 = "Ignacio CastaƱo"; + const uint32_t test32[] = + { + 0x49, 0x67, 0x6E, 0x61, 0x63, + 0x69, 0x6F, 0x20, 0x43, 0x61, + 0x73, 0x74, 0x61, 0xF1, 0x6F, + 0x00 + }; + + OVR_ASSERT(check_equal(test8, test32)); + } + + // If user passed an arg, try reading the file as UTF-8 encoded text. + if (argc > 1) + { + const char* filename = argv[1]; + FILE* fp = fopen(filename, "rb"); + if (fp == NULL) + { + printf("Can't open file '%s'\n", filename); + return 1; + } + + // Read lines from the file, encode/decode them, and highlight discrepancies. + const int LINE_SIZE = 200; // max line size + char line_buffer_utf8[LINE_SIZE]; + char reencoded_utf8[6 * LINE_SIZE]; + uint32_t line_buffer_ucs[LINE_SIZE]; + + int byte_counter = 0; + for (;;) + { + int c = fgetc(fp); + if (c == EOF) + { + // Done. + break; + } + line_buffer_utf8[byte_counter++] = c; + if (c == '\n' || byte_counter >= LINE_SIZE - 2) + { + // End of line. Process the line. + line_buffer_utf8[byte_counter++] = 0; // terminate. + + // Decode into UCS. + const char* p = line_buffer_utf8; + uint32_t* q = line_buffer_ucs; + for (;;) + { + uint32_t uc = UTF8Util::DecodeNextChar(&p); + *q++ = uc; + + OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE); + OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE); + + if (uc == 0) break; + } + + // Encode back into UTF-8. + q = line_buffer_ucs; + int index = 0; + for (;;) + { + uint32_t uc = *q++; + OVR_ASSERT(index < LINE_SIZE * 6 - 6); + int last_index = index; + UTF8Util::EncodeChar(reencoded_utf8, &index, uc); + OVR_ASSERT(index <= last_index + 6); + if (uc == 0) break; + } + + // This can be useful for debugging. +#if 0 + // Show the UCS and the re-encoded UTF-8. + log_ucs(line_buffer_ucs); + log_ascii(reencoded_utf8); +#endif // 0 + + OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs)); + OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs)); + + // Start next line. + byte_counter = 0; + } + } + + fclose(fp); + } + + return 0; +} + + +#endif // UTF8_UNIT_TEST + +}} // namespace UTF8Util::OVR + |