00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef PALUDIS_GUARD_PALUDIS_TOKENISER_HH
00021 #define PALUDIS_GUARD_PALUDIS_TOKENISER_HH 1
00022
00023 #include <iterator>
00024 #include <paludis/util/instantiation_policy.hh>
00025 #include <paludis/util/exception.hh>
00026 #include <paludis/util/stringify.hh>
00027 #include <string>
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039 namespace paludis
00040 {
00041
00042
00043
00044
00045
/**
 * Tag types that select how the delimiter string handed to the tokeniser
 * is interpreted.
 */
namespace delim_kind
{
    /**
     * Tag: every individual character of the delimiter string counts as a
     * delimiter.
     */
    struct AnyOfTag;
}
00055
00056
00057
00058
00059
00060
/**
 * Tag types that select what the tokeniser does with the delimiters it
 * encounters between tokens.
 */
namespace delim_mode
{
    /**
     * Tag: delimiters only separate tokens; they are not written to the
     * output.
     */
    struct DelimiterTag;

    /**
     * Tag: delimiters mark token boundaries and are themselves written to
     * the output alongside the tokens.
     */
    struct BoundaryTag;
}
00077
00078
00079
00080
00081
00082
00083 namespace tokeniser_internals
00084 {
00085
00086
00087
00088
00089
/**
 * Output policy used by the tokeniser. Only declared here; a partial
 * specialisation is provided for each delimiter mode tag.
 */
template <class DelimMode_, class Iter_>
struct Writer;
00092
00093
00094
00095
00096
00097
00098
00099 template <typename Iter_>
00100 struct Writer<delim_mode::DelimiterTag, Iter_>
00101 {
00102
00103
00104
00105 static void handle_token(const std::string & s, Iter_ & i)
00106 {
00107 *i++ = s;
00108 }
00109
00110
00111
00112
00113 static void handle_delim(const std::string &, const Iter_ &)
00114 {
00115 }
00116 };
00117
00118
00119
00120
00121
00122
00123
00124 template <typename Iter_>
00125 struct Writer<delim_mode::BoundaryTag, Iter_>
00126 {
00127
00128
00129
00130 static void handle_token(const std::string & s, Iter_ & i)
00131 {
00132 *i++ = s;
00133 }
00134
00135
00136
00137
00138 static void handle_delim(const std::string & s, Iter_ & i)
00139 {
00140 *i++ = s;
00141 }
00142 };
00143
/**
 * Splits a string into a sequence of lexemes: runs of delimiter characters,
 * single quote characters, and runs of ordinary text.
 *
 * Call next() repeatedly; while it returns true, \c kind and \c value
 * describe the lexeme just read. \c delims is deliberately mutable so a
 * caller can change the delimiter set between calls to next().
 */
struct Lexer
{
    const std::string text;             ///< Copy of the string being lexed.
    std::string::size_type text_pos;    ///< Offset of the next unread character.
    std::string delims;                 ///< Characters currently treated as delimiters.
    const std::string quotes;           ///< Characters treated as quotes.

    std::string value;                  ///< Text of the most recently read lexeme.
    enum { t_quote, t_delim, t_text } kind; ///< Kind of the most recently read lexeme.

    /**
     * \param t string to lex
     * \param d initial set of delimiter characters
     * \param q set of quote characters
     */
    Lexer(const std::string & t, const std::string & d, const std::string & q) :
        text(t),
        text_pos(0),
        delims(d),
        quotes(q),
        value(),
        // Give kind a definite value so reading it before the first
        // successful next() is not a read of an indeterminate enum.
        kind(t_text)
    {
    }

    /**
     * Read the next lexeme into \c value / \c kind.
     *
     * - A character found in \c delims starts a t_delim lexeme covering the
     *   whole run of consecutive delimiter characters.
     * - Otherwise a character found in \c quotes is a one-character t_quote
     *   lexeme.
     * - Anything else starts a t_text lexeme that extends up to (but not
     *   including) the next delimiter or quote character, or to the end of
     *   the text.
     *
     * \return false, leaving \c value and \c kind unchanged, once the text
     * is exhausted; true otherwise.
     */
    bool next()
    {
        const std::string::size_type len(text.length());
        if (text_pos >= len)
            return false;

        const std::string::size_type start_pos(text_pos);
        const char first(text[start_pos]);
        std::string::size_type end_pos;

        if (std::string::npos != delims.find(first))
        {
            // Swallow the entire run of delimiter characters.
            end_pos = text.find_first_not_of(delims, start_pos + 1);
            kind = t_delim;
        }
        else if (std::string::npos != quotes.find(first))
        {
            // A quote is always exactly one character long.
            end_pos = start_pos + 1;
            kind = t_quote;
        }
        else
        {
            // Text stops at whichever comes first: a delimiter or a quote.
            // The first character is already known to be neither.
            const std::string stops(delims + quotes);
            end_pos = text.find_first_of(stops, start_pos + 1);
            kind = t_text;
        }

        // No terminating character found: the lexeme runs to the end.
        if (std::string::npos == end_pos)
            end_pos = len;

        value.assign(text, start_pos, end_pos - start_pos);
        text_pos = end_pos;
        return true;
    }
};
00198
00199 template <typename DelimKind_, typename DelimMode_ = delim_mode::DelimiterTag>
00200 struct Tokeniser;
00201
00202 template <typename DelimMode_>
00203 class Tokeniser<delim_kind::AnyOfTag, DelimMode_>
00204 {
00205 private:
00206 Tokeniser();
00207
00208 public:
00209 template <typename Iter_>
00210 static void tokenise(const std::string & s,
00211 const std::string & delims,
00212 const std::string & quotes,
00213 Iter_ iter);
00214 };
00215 }
00216
00217
00218
00219
00220
00221
00222
00223 class PALUDIS_VISIBLE TokeniserError :
00224 public Exception
00225 {
00226 public:
00227
00228
00229
00230 TokeniserError(const std::string & s, const std::string & msg) throw ();
00231
00232
00233 };
00234
00235 template <typename DelimMode_>
00236 template <typename Iter_>
00237 void
00238 tokeniser_internals::Tokeniser<delim_kind::AnyOfTag, DelimMode_>::tokenise(
00239 const std::string & s,
00240 const std::string & delims,
00241 const std::string & quotes,
00242 Iter_ iter)
00243 {
00244 typedef tokeniser_internals::Lexer Lexer;
00245 Lexer l(s, delims, quotes);
00246
00247 enum { s_initial, s_had_quote, s_had_text, s_had_quote_text, s_had_quote_text_quote } state = s_initial;
00248
00249 while (l.next())
00250 {
00251 switch (state)
00252 {
00253 case s_initial:
00254 switch (l.kind)
00255 {
00256 case Lexer::t_quote:
00257 state = s_had_quote;
00258 l.delims = "";
00259 break;
00260
00261 case Lexer::t_delim:
00262 state = s_initial;
00263 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
00264 break;
00265
00266 case Lexer::t_text:
00267 state = s_had_text;
00268 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
00269 break;
00270 }
00271 break;
00272
00273 case s_had_quote:
00274 switch (l.kind)
00275 {
00276 case Lexer::t_quote:
00277 state = s_had_quote_text_quote;
00278 l.delims = delims;
00279 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token("", iter);
00280 break;
00281
00282 case Lexer::t_delim:
00283 throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote");
00284 break;
00285
00286 case Lexer::t_text:
00287 state = s_had_quote_text;
00288 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
00289 break;
00290 }
00291 break;
00292
00293 case s_had_quote_text:
00294 switch (l.kind)
00295 {
00296 case Lexer::t_text:
00297 throw InternalError(PALUDIS_HERE, "t_text in s_had_quote_text");
00298 break;
00299
00300 case Lexer::t_delim:
00301 throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote_text");
00302 break;
00303
00304 case Lexer::t_quote:
00305 state = s_had_quote_text_quote;
00306 l.delims = delims;
00307 break;
00308 }
00309 break;
00310
00311 case s_had_quote_text_quote:
00312 switch (l.kind)
00313 {
00314 case Lexer::t_text:
00315 throw TokeniserError(s, "Close quote followed by text");
00316 break;
00317
00318 case Lexer::t_quote:
00319 throw TokeniserError(s, "Close quote followed by quote");
00320 break;
00321
00322 case Lexer::t_delim:
00323 state = s_initial;
00324 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
00325 break;
00326 }
00327 break;
00328
00329 case s_had_text:
00330 switch (l.kind)
00331 {
00332 case Lexer::t_text:
00333 throw InternalError(PALUDIS_HERE, "t_text in s_had_text");
00334 break;
00335
00336 case Lexer::t_quote:
00337 throw TokeniserError(s, "Text followed by quote");
00338 break;
00339
00340 case Lexer::t_delim:
00341 state = s_initial;
00342 tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
00343 break;
00344 }
00345 break;
00346 }
00347 }
00348
00349 switch (state)
00350 {
00351 case s_initial:
00352 case s_had_text:
00353 case s_had_quote_text_quote:
00354 return;
00355
00356 case s_had_quote:
00357 case s_had_quote_text:
00358 throw TokeniserError(s, "Unterminated quoted string");
00359 }
00360 }
00361
00362
00363
00364
00365
00366
00367
00368 template <typename DelimKind_, typename DelimMode_, typename Iter_>
00369 void tokenise(const std::string & s, const std::string & delims, const std::string & quotes, Iter_ iter)
00370 {
00371 tokeniser_internals::Tokeniser<DelimKind_, DelimMode_>::template tokenise<Iter_>(s, delims, quotes, iter);
00372 }
00373
00374
00375
00376
00377
00378
00379
00380 template <typename Iter_>
00381 void tokenise_whitespace(const std::string & s, Iter_ iter)
00382 {
00383 tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "", iter);
00384 }
00385
00386
00387
00388
00389
00390
00391
00392 template <typename Iter_>
00393 void tokenise_whitespace_quoted(const std::string &s, Iter_ iter)
00394 {
00395 tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "'\"", iter);
00396 }
00397 }
00398
00399 #endif