tokeniser.hh

/* vim: set sw=4 sts=4 et foldmethod=syntax : */

/*
 * Copyright (c) 2006, 2007 Ciaran McCreesh
 *
 * This file is part of the Paludis package manager. Paludis is free software;
 * you can redistribute it and/or modify it under the terms of the GNU General
 * Public License version 2, as published by the Free Software Foundation.
 *
 * Paludis is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef PALUDIS_GUARD_PALUDIS_TOKENISER_HH
#define PALUDIS_GUARD_PALUDIS_TOKENISER_HH 1

#include <iterator>
#include <paludis/util/instantiation_policy.hh>
#include <paludis/util/exception.hh>
#include <paludis/util/stringify.hh>
#include <string>

/** \file
 * Declarations for Tokeniser and related utilities.
 *
 * \ingroup g_strings
 *
 * \section Examples
 *
 * - Splitting a string on whitespace:
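 *
 *   A minimal sketch; the vector and its name are illustrative only.
 *
 *   \code
 *   std::vector<std::string> tokens;
 *   paludis::tokenise_whitespace("one two\tthree", std::back_inserter(tokens));
 *   // tokens now holds "one", "two" and "three"
 *   \endcode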
 */

namespace paludis
{
    /**
     * Delimiter policy for Tokeniser.
     *
     * \ingroup g_strings
     */
    namespace delim_kind
    {
        /**
         * Split on any one of the given delimiter characters; whether the
         * delimiters themselves are kept is controlled by delim_mode.
         *
         * \ingroup g_strings
         */
        struct AnyOfTag;
    }

    /**
     * Delimiter mode for Tokeniser.
     *
     * \ingroup g_strings
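     *
     * For example, splitting "a:b" on ':' yields the tokens "a" and "b" with
     * DelimiterTag, but "a", ":" and "b" with BoundaryTag.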
     */
    namespace delim_mode
    {
        /**
         * Discard the delimiters.
         *
         * \ingroup g_strings
         */
        struct DelimiterTag;

        /**
         * Keep the delimiters.
         *
         * \ingroup g_strings
         */
        struct BoundaryTag;
    }

    /**
     * For Tokeniser's internal use only.
     *
     * \ingroup g_strings
     */
    namespace tokeniser_internals
    {
        /**
         * A Writer handles Tokeniser's writes.
         *
         * \ingroup g_strings
         */
        template <typename DelimMode_, typename Iter_>
        struct Writer;

        /**
         * A Writer handles Tokeniser's writes (specialisation for
         * delim_mode::DelimiterTag).
         *
         * \ingroup g_strings
         */
        template <typename Iter_>
        struct Writer<delim_mode::DelimiterTag, Iter_>
        {
            /**
             * Handle a token.
             */
            static void handle_token(const std::string & s, Iter_ & i)
            {
                *i++ = s;
            }

            /**
             * Handle a delimiter.
             */
            static void handle_delim(const std::string &, const Iter_ &)
            {
            }
        };

        /**
         * A Writer handles Tokeniser's writes (specialisation for
         * delim_mode::BoundaryTag).
         *
         * \ingroup g_strings
         */
        template <typename Iter_>
        struct Writer<delim_mode::BoundaryTag, Iter_>
        {
            /**
             * Handle a token.
             */
            static void handle_token(const std::string & s, Iter_ & i)
            {
                *i++ = s;
            }

            /**
             * Handle a delimiter.
             */
            static void handle_delim(const std::string & s, Iter_ & i)
            {
                *i++ = s;
            }
        };

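        /**
         * A simple lexer used by Tokeniser. Each call to next() consumes the
         * next run of delimiters, single quote character or run of plain text
         * from the input, storing it in value with its classification in kind.
         *
         * \ingroup g_strings
         */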
        struct Lexer
        {
            const std::string text;
            std::string::size_type text_pos;
            std::string delims;
            const std::string quotes;

            std::string value;
            enum { t_quote, t_delim, t_text } kind;

            Lexer(const std::string & t, const std::string & d, const std::string & q) :
                text(t),
                text_pos(0),
                delims(d),
                quotes(q)
            {
            }

            bool next()
            {
                if (text_pos >= text.length())
                    return false;

                if (std::string::npos != delims.find(text[text_pos]))
                {
                    std::string::size_type start_pos(text_pos);
                    while (++text_pos < text.length())
                        if (std::string::npos == delims.find(text[text_pos]))
                            break;

                    value = text.substr(start_pos, text_pos - start_pos);
                    kind = t_delim;
                }
                else if (std::string::npos != quotes.find(text[text_pos]))
                {
                    value = std::string(1, text[text_pos]);
                    kind = t_quote;
                    ++text_pos;
                }
                else
                {
                    std::string::size_type start_pos(text_pos);
                    while (++text_pos < text.length())
                        if (std::string::npos != delims.find(text[text_pos]))
                            break;
                        else if (std::string::npos != quotes.find(text[text_pos]))
                            break;
                    value = text.substr(start_pos, text_pos - start_pos);
                    kind = t_text;
                }

                return true;
            }
        };

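        /**
         * Tokeniser splits a string into tokens written through an output
         * iterator. Internal use only; the tokenise free functions are the
         * public interface.
         *
         * \ingroup g_strings
         */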
        template <typename DelimKind_, typename DelimMode_ = delim_mode::DelimiterTag>
        struct Tokeniser;

        template <typename DelimMode_>
        class Tokeniser<delim_kind::AnyOfTag, DelimMode_>
        {
            private:
                Tokeniser();

            public:
                template <typename Iter_>
                static void tokenise(const std::string & s,
                        const std::string & delims,
                        const std::string & quotes,
                        Iter_ iter);
        };
    }

    /**
     * Thrown if a Tokeniser encounters a syntax error (for example, mismatched quotes).
     *
     * \ingroup g_strings
     * \since 0.26
     */
    class PALUDIS_VISIBLE TokeniserError :
        public Exception
    {
        public:
            ///\name Basic operations
            ///\{

            TokeniserError(const std::string & s, const std::string & msg) throw ();

            ///\}
    };

    template <typename DelimMode_>
    template <typename Iter_>
    void
    tokeniser_internals::Tokeniser<delim_kind::AnyOfTag, DelimMode_>::tokenise(
            const std::string & s,
            const std::string & delims,
            const std::string & quotes,
            Iter_ iter)
    {
        typedef tokeniser_internals::Lexer Lexer;
        Lexer l(s, delims, quotes);

        enum { s_initial, s_had_quote, s_had_text, s_had_quote_text, s_had_quote_text_quote } state = s_initial;
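        /* State machine: s_initial is between tokens, s_had_quote follows an
         * opening quote, s_had_text follows unquoted text, s_had_quote_text
         * follows text inside quotes, and s_had_quote_text_quote follows a
         * closing quote. */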

        while (l.next())
        {
            switch (state)
            {
                case s_initial:
                    switch (l.kind)
                    {
                        case Lexer::t_quote:
                            state = s_had_quote;
                            l.delims = "";
                            break;

                        case Lexer::t_delim:
                            state = s_initial;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
                            break;

                        case Lexer::t_text:
                            state = s_had_text;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
                            break;
                    }
                    break;

                case s_had_quote:
                    switch (l.kind)
                    {
                        case Lexer::t_quote:
                            state = s_had_quote_text_quote;
                            l.delims = delims;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token("", iter);
                            break;

                        case Lexer::t_delim:
                            throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote");
                            break;

                        case Lexer::t_text:
                            state = s_had_quote_text;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
                            break;
                    }
                    break;

                case s_had_quote_text:
                    switch (l.kind)
                    {
                        case Lexer::t_text:
                            throw InternalError(PALUDIS_HERE, "t_text in s_had_quote_text");
                            break;

                        case Lexer::t_delim:
                            throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote_text");
                            break;

                        case Lexer::t_quote:
                            state = s_had_quote_text_quote;
                            l.delims = delims;
                            break;
                    }
                    break;

                case s_had_quote_text_quote:
                    switch (l.kind)
                    {
                        case Lexer::t_text:
                            throw TokeniserError(s, "Close quote followed by text");
                            break;

                        case Lexer::t_quote:
                            throw TokeniserError(s, "Close quote followed by quote");
                            break;

                        case Lexer::t_delim:
                            state = s_initial;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
                            break;
                    }
                    break;

                case s_had_text:
                    switch (l.kind)
                    {
                        case Lexer::t_text:
                            throw InternalError(PALUDIS_HERE, "t_text in s_had_text");
                            break;

                        case Lexer::t_quote:
                            throw TokeniserError(s, "Text followed by quote");
                            break;

                        case Lexer::t_delim:
                            state = s_initial;
                            tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
                            break;
                    }
                    break;
            }
        }

        switch (state)
        {
            case s_initial:
            case s_had_text:
            case s_had_quote_text_quote:
                return;

            case s_had_quote:
            case s_had_quote_text:
                throw TokeniserError(s, "Unterminated quoted string");
        }
    }

    /**
     * Tokenise a string.
     *
     * \ingroup g_strings
     * \since 0.26
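     *
     * An illustrative call; the container and its name are not part of this
     * header:
     *
     * \code
     * std::list<std::string> tokens;
     * tokenise<delim_kind::AnyOfTag, delim_mode::BoundaryTag>(
     *         "one,two;;three", ",;", "", std::back_inserter(tokens));
     * // tokens now holds "one", ",", "two", ";;", "three"
     * \endcode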
     */
    template <typename DelimKind_, typename DelimMode_, typename Iter_>
    void tokenise(const std::string & s, const std::string & delims, const std::string & quotes, Iter_ iter)
    {
        tokeniser_internals::Tokeniser<DelimKind_, DelimMode_>::template tokenise<Iter_>(s, delims, quotes, iter);
    }

    /**
     * Convenience function: tokenise on whitespace.
     *
     * \ingroup g_strings
     * \since 0.26
     */
    template <typename Iter_>
    void tokenise_whitespace(const std::string & s, Iter_ iter)
    {
        tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "", iter);
    }

    /**
     * Convenience function: tokenise on whitespace, handling quoted strings.
     *
     * \ingroup g_strings
     * \since 0.26
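     *
     * An illustrative call; the vector and its name are not part of this
     * header:
     *
     * \code
     * std::vector<std::string> tokens;
     * tokenise_whitespace_quoted("cat 'fluffy pets' dog", std::back_inserter(tokens));
     * // tokens now holds "cat", "fluffy pets" and "dog"
     * \endcode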
     */
    template <typename Iter_>
    void tokenise_whitespace_quoted(const std::string & s, Iter_ iter)
    {
        tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "'\"", iter);
    }
}

#endif
