Open Chinese Convert  1.1.2
A project for conversion between Traditional and Simplified Chinese
UTF8StringSlice.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include <cstring>
20 
21 #include "Common.hpp"
22 #include "UTF8Util.hpp"
23 
24 namespace opencc {
25 
26 namespace internal {
27 
/**
 * Computes an FNV-1a style hash over a range of bytes.
 *
 * For every byte the running hash is XOR-ed with the byte value and then
 * multiplied by FNV_prime. Arithmetic is unsigned and wraps around at the
 * width of size_t.
 *
 * @param text             First byte to hash. Exactly byteLength bytes are
 *                         read; a NUL byte does not end the scan.
 * @param byteLength       Number of bytes to hash.
 * @param FNV_prime        Multiplier applied after each byte is mixed in.
 * @param FNV_offset_basis Initial hash value; returned unchanged when
 *                         byteLength is 0.
 * @return The accumulated hash value.
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  const char* const end = text + byteLength;
  for (const char* pstr = text; pstr < end; pstr++) {
    // Convert through unsigned char so that bytes >= 0x80 (every byte of a
    // non-ASCII UTF-8 character) contribute only their low 8 bits. Where
    // plain char is signed, converting it directly to size_t sign-extends
    // and would flip all upper bits of the hash.
    hash ^= static_cast<size_t>(static_cast<unsigned char>(*pstr));
    hash *= FNV_prime;
  }
  return hash;
}
37 
38 template <int> size_t FNVHash(const char* text, const size_t byteLength);
39 
40 template <>
41 inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
42  return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
43 }
44 
45 #if SIZE_MAX == 0xffffffffffffffff
46 template <>
47 inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
48  return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
49 }
50 #endif
51 
52 } // namespace internal
53 
54 template <typename LENGTH_TYPE> class UTF8StringSliceBase {
55 public:
56  typedef LENGTH_TYPE LengthType;
57 
58  UTF8StringSliceBase(const char* _str)
59  : str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
60  byteLength(static_cast<LengthType>(strlen(_str))) {}
61 
62  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
63  : str(_str), utf8Length(_utf8Length) {
64  CalculateByteLength();
65  }
66 
67  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
68  const LengthType _byteLength)
69  : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
70  CalculateByteLength();
71  }
72 
73  LengthType UTF8Length() const { return utf8Length; }
74 
75  LengthType ByteLength() const { return byteLength; }
76 
77  UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
78  if (numberOfCharacters == UTF8Length()) {
79  return *this;
80  } else {
81  return UTF8StringSliceBase(str, numberOfCharacters);
82  }
83  }
84 
85  UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
86  if (numberOfCharacters == UTF8Length()) {
87  return *this;
88  } else {
89  const char* pstr = str + byteLength;
90  for (size_t i = 0; i < numberOfCharacters; i++) {
91  pstr = UTF8Util::PrevChar(pstr);
92  }
93  return UTF8StringSliceBase(pstr, numberOfCharacters);
94  }
95  }
96 
97  UTF8StringSliceBase SubString(const LengthType offset,
98  const LengthType numberOfCharacters) const {
99  if (offset == 0) {
100  return Left(numberOfCharacters);
101  } else {
102  const char* pstr = str;
103  for (size_t i = 0; i < offset; i++) {
104  pstr = UTF8Util::NextChar(pstr);
105  }
106  return UTF8StringSliceBase(pstr, numberOfCharacters);
107  }
108  }
109 
110  std::string ToString() const { return std::string(str, str + byteLength); }
111 
112  const char* CString() const { return str; }
113 
114  LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
115  if (str == that.str) {
116  return (std::min)(utf8Length, that.utf8Length);
117  } else {
118  const char* pstr1 = str;
119  const char* pstr2 = that.str;
120  for (size_t length = 0; length < utf8Length && length < that.utf8Length;
121  length++) {
122  size_t charLen1 = UTF8Util::NextCharLength(pstr1);
123  size_t charLen2 = UTF8Util::NextCharLength(pstr2);
124  if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
125  return length;
126  }
127  pstr1 += charLen1;
128  pstr2 += charLen2;
129  }
130  return 0;
131  }
132  }
133 
134  void MoveRight() {
135  if (utf8Length > 0) {
136  const size_t charLen = UTF8Util::NextCharLength(str);
137  str += charLen;
138  utf8Length--;
139  byteLength -= charLen;
140  }
141  }
142 
143  void MoveLeft() {
144  if (utf8Length > 0) {
145  const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
146  utf8Length--;
147  byteLength -= charLen;
148  }
149  }
150 
151  int ReverseCompare(const UTF8StringSliceBase& that) const {
152  const char* pstr1 = str + byteLength;
153  const char* pstr2 = that.str + that.byteLength;
154  const size_t length = (std::min)(utf8Length, that.utf8Length);
155  for (size_t i = 0; i < length; i++) {
156  const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
157  const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
158  pstr1 -= charLen1;
159  pstr2 -= charLen2;
160  const int cmp = strncmp(pstr1, pstr2, (std::min)(charLen1, charLen2));
161  if (cmp < 0) {
162  return -1;
163  } else if (cmp > 0) {
164  return 1;
165  } else if (charLen1 < charLen2) {
166  return -1;
167  } else if (charLen1 > charLen2) {
168  return 1;
169  }
170  }
171  if (utf8Length < that.utf8Length) {
172  return -1;
173  } else if (utf8Length > that.utf8Length) {
174  return 1;
175  } else {
176  return 0;
177  }
178  }
179 
180  LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
181  return static_cast<LengthType>(
182  ToString().find(pattern.str, 0, pattern.byteLength));
183  }
184 
185  bool operator<(const UTF8StringSliceBase& that) const {
186  return Compare(that) < 0;
187  }
188 
189  bool operator>(const UTF8StringSliceBase& that) const {
190  return Compare(that) > 0;
191  }
192 
193  bool operator==(const UTF8StringSliceBase& that) const {
194  return (str == that.str && utf8Length == that.utf8Length) ||
195  Compare(that) == 0;
196  }
197 
198  bool operator!=(const UTF8StringSliceBase& that) const {
199  return !this->operator==(that);
200  }
201 
202  class Hasher {
203  public:
204  size_t operator()(const UTF8StringSliceBase& text) const {
205  return internal::FNVHash<sizeof(size_t)>(text.CString(),
206  text.ByteLength());
207  }
208  };
209 
210 private:
211  inline int Compare(const UTF8StringSliceBase& that) const {
212  int cmp = strncmp(str, that.str, (std::min)(byteLength, that.byteLength));
213  if (cmp == 0) {
214  if (utf8Length < that.utf8Length) {
215  cmp = -1;
216  } else if (utf8Length > that.utf8Length) {
217  cmp = 1;
218  } else {
219  cmp = 0;
220  }
221  }
222  return cmp;
223  }
224 
225  void CalculateByteLength() {
226  const char* pstr = str;
227  for (size_t i = 0; i < utf8Length; i++) {
228  pstr = UTF8Util::NextChar(pstr);
229  }
230  byteLength = static_cast<LengthType>(pstr - str);
231  }
232 
233  const char* str;
234  LengthType utf8Length;
235  LengthType byteLength;
236 };
237 
// Slice type whose character and byte lengths are stored as size_t.
238 typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
239 
240 template <typename LENGTH_TYPE>
241 std::ostream& operator<<(::std::ostream& os,
242  const UTF8StringSliceBase<LENGTH_TYPE>& str) {
243  return os << str.ToString();
244 }
245 
246 } // namespace opencc
Definition: UTF8StringSlice.hpp:202
Definition: UTF8StringSlice.hpp:54
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF8 character.
Definition: UTF8Util.hpp:80
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:118
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 std::string.
Definition: UTF8Util.hpp:125
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:111
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF8 character.
Definition: UTF8Util.hpp:69