regex_nfa.h

Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  * @file bits/regex_nfa.h
00027  * This is an internal header file, included by other library headers.
00028  * You should not attempt to use it directly.
00029  */
00030 
00031 namespace std
00032 {
00033 namespace __regex
00034 {
00035 
00036   // Abstract base class for automata (could be an NFA or a DFA): a sub-expression count plus a debug-only dump.
00037   class _Automaton
00038   {
00039   public:
00040     typedef unsigned int _SizeT;  // return type of _M_sub_count()
00041 
00042   public:
00043     virtual
00044     ~_Automaton()  // virtual: the class is meant to be used through base pointers (see _AutomatonPtr)
00045     { }
00046 
00047     virtual _SizeT
00048     _M_sub_count() const = 0;  // pure virtual; _Nfa implements it by returning its sub-expression count
00049 
00050 #ifdef _GLIBCXX_DEBUG
00051     virtual std::ostream&
00052     _M_dot(std::ostream& __ostr) const = 0;  // only declared when _GLIBCXX_DEBUG is defined
00053 #endif
00054   };
00055 
00056   // Generic shared pointer to an automaton.
00057   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
00058 
00059   // Operation codes that define the type of transitions within the base NFA
00060   // that represents the regular expression.
00061   enum _Opcode
00062   {
00063       _S_opcode_unknown       =   0,
00064       _S_opcode_alternative   =   1,  // state has two successors: _M_next and _M_alt
00065       _S_opcode_subexpr_begin =   4,  // state carries a sub-expression index and a _Tagger
00066       _S_opcode_subexpr_end   =   5,  // state carries a sub-expression index and a _Tagger
00067       _S_opcode_match         = 100,  // state carries a _Matcher
00068       _S_opcode_accept        = 255   // accepting state (see _Nfa::_M_insert_accept)
00069   };
00070 
00071   // Provides a generic facade for a templated match_results.
00072   struct _Results  // NOTE(review): abstract interface, but no virtual destructor is declared here
00073   {
00074     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;  // taggers in this file pass __j = 0 (start) or 1 (end)
00075     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
00076   };
00077 
00078   // Tags current state (for subexpr begin/end): a callable taking the cursor and the _Results to write to.
00079   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
00080   // Tagger for the start of a sub-expression.  _FwdIterT and _TraitsT are not referenced in the body.
00081   template<typename _FwdIterT, typename _TraitsT>
00082     struct _StartTagger
00083     : public _Tagger
00084     {
00085       explicit
00086       _StartTagger(int __i)
00087       : _M_index(__i)  // the _Tagger base is not mentioned, so it is default-constructed
00088       { }
00089       // Forwards to _Results::_M_set_pos with the stored index and 0 as the second argument.
00090       void
00091       operator()(const _PatternCursor& __pc, _Results& __r)
00092       { __r._M_set_pos(_M_index, 0, __pc); }
00093       // NOTE(review): this operator() is non-virtual; a copy taken as a plain _Tagger looks like it loses it - confirm against callers.
00094       int       _M_index;
00095     };
00096   // Tagger for the end of a sub-expression.  _TraitsT is not referenced in the body.
00097   template<typename _FwdIterT, typename _TraitsT>
00098     struct _EndTagger
00099     : public _Tagger
00100     {
00101       explicit
00102       _EndTagger(int __i)
00103       : _M_index(__i)  // the _Tagger base is not mentioned, so it is default-constructed
00104       { }
00105       // Forwards to _Results::_M_set_pos with the stored index and 1 as the second argument.
00106       void
00107       operator()(const _PatternCursor& __pc, _Results& __r)
00108       { __r._M_set_pos(_M_index, 1, __pc); }
00109       // NOTE(review): this operator() is non-virtual; a copy taken as a plain _Tagger looks like it loses it - confirm against callers.
00110       int       _M_index;
00111       _FwdIterT _M_pos;  // absent from the constructor's initializer list and never read in this struct
00112     };
00113   // Indicates if current state matches cursor current: a predicate callable taking the cursor.
00114   typedef std::function<bool (const _PatternCursor&)> _Matcher;
00115 
00116   // Matches any character: the cursor argument is ignored and the result is always true.
00117   inline bool
00118   _AnyMatcher(const _PatternCursor&)
00119   { return true; }
00120 
00121   // Matches a single character; both sides of the comparison go through _TraitsT::translate.
00122   template<typename _InIterT, typename _TraitsT>
00123     struct _CharMatcher
00124     : public _Matcher
00125     {
00126       typedef typename _TraitsT::char_type char_type;
00127       // NOTE(review): _M_traits is a reference; with the default argument it binds to a temporary - looks dangling, confirm.
00128       explicit
00129       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
00130       : _M_traits(__t), _M_c(_M_traits.translate(__c))  // _M_traits is declared before _M_c, so it is bound first
00131       { }
00132       // Returns true when translate(__c._M_current()) equals the character stored at construction.
00133       bool
00134       operator()(const _PatternCursor& __pc) const
00135       {
00136     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00137     _CursorT __c = static_cast<_CursorT>(__pc);  // unchecked downcast: assumes __pc really is this cursor type
00138     return _M_traits.translate(__c._M_current()) == _M_c;
00139       }
00140 
00141       const _TraitsT& _M_traits;
00142       char_type       _M_c;  // stored already translated by the constructor
00143     };
00144 
00145   // Matches a character range (bracket expression).  As written this is a stub: operator() always returns true.
00146   template<typename _InIterT, typename _TraitsT>
00147     struct _RangeMatcher
00148     : public _Matcher
00149     {
00150       typedef typename _TraitsT::char_type _CharT;
00151       typedef std::basic_string<_CharT>    _StringT;
00152       // NOTE(review): _M_traits is a reference; with the default argument it binds to a temporary - looks dangling, confirm.
00153       explicit
00154       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
00155       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
00156       { }
00157 
00158       bool
00159       operator()(const _PatternCursor& __pc) const
00160       {
00161     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00162     _CursorT __c = static_cast<_CursorT>(__pc);  // __c is not used afterwards
00163     return true;  // unconditional: neither the cursor nor _M_is_non_matching is consulted
00164       }
00165       // The _M_add_* members and _M_make_range below have empty bodies: their arguments are not recorded.
00166       void
00167       _M_add_char(_CharT __c)
00168       { }
00169 
00170       void
00171       _M_add_collating_element(const _StringT& __s)
00172       { }
00173 
00174       void
00175       _M_add_equivalence_class(const _StringT& __s)
00176       { }
00177 
00178       void
00179       _M_add_character_class(const _StringT& __s)
00180       { }
00181 
00182       void
00183       _M_make_range()
00184       { }
00185 
00186       const _TraitsT& _M_traits;
00187       bool            _M_is_non_matching;  // stored by the constructor; not read anywhere in this struct
00188     };
00189 
00190   // Identifies a state in the NFA; _Nfa's insert members return the state's index in the vector.
00191   typedef int _StateIdT;
00192 
00193   // The special case in which a state identifier is not an index (used as the "no state" value).
00194   static const _StateIdT _S_invalid_state_id  = -1;
00195 
00196 
00197   // An individual state in an NFA
00198   //
00199   // In this case a "state" is an entry in the NFA definition coupled with its
00200   // outgoing transition(s).  All states have a single outgoing transition,
00201   // except for accepting states (which have no outgoing transitions) and alt
00202   // states, which have two outgoing transitions.
00203   // Each constructor below initialises only the members relevant to its kind of state.
00204   struct _State
00205   {
00206     typedef int  _OpcodeT;
00207     // The opcode is stored as a plain int; the values used in this file come from enum _Opcode.
00208     _OpcodeT     _M_opcode;    // type of outgoing transition
00209     _StateIdT    _M_next;      // outgoing transition
00210     _StateIdT    _M_alt;       // for _S_opcode_alternative
00211     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
00212     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
00213     _Matcher     _M_matches;   // for _S_opcode_match
00214     // State with only an opcode: _M_alt and _M_subexpr get no initializer here.
00215     explicit _State(_OpcodeT __opcode)
00216     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00217     { }
00218     // Match state (not explicit, so a _Matcher converts implicitly): _M_alt and _M_subexpr get no initializer.
00219     _State(const _Matcher& __m)
00220     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
00221     { }
00222     // State with a sub-expression index and a copy of the tagger: _M_alt gets no initializer.
00223     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
00224     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
00225       _M_tagger(__t)
00226     { }
00227     // Alternative state with both successors given: _M_subexpr gets no initializer.
00228     _State(_StateIdT __next, _StateIdT __alt)
00229     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
00230     { }
00231 
00232 #ifdef _GLIBCXX_DEBUG
00233     std::ostream&
00234     _M_print(std::ostream& ostr) const;
00235 
00236     // Prints graphviz dot commands for state.
00237     std::ostream&
00238     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00239 #endif
00240   };
00241 
00242   
00243   // The Grep Matcher works on sets of states.  Here are sets of states (an ordered set of state ids).
00244   typedef std::set<_StateIdT> _StateSet;
00245 
00246  // A collection of all states making up an NFA
00247   //
00248   // An NFA is a 4-tuple M = (K, S, s, F), where
00249   //    K is a finite set of states,
00250   //    S is the alphabet of the NFA,
00251   //    s is the initial state,
00252   //    F is a set of final (accepting) states.
00253   //
00254   // _Nfa is not itself a template.  It is a std::vector of _State in which
00255   // a state's identifier is its index: every _M_insert_* member appends one
00256   // state and returns size()-1.  The start state is set to 0 by the
00257   // constructor; F and the sub-expression count are updated on insertion.
00258   //
00259   class _Nfa
00260   : public _Automaton, public std::vector<_State>
00261   {
00262   public:
00263     typedef _State                              _StateT;
00264     typedef unsigned int                        _SizeT;
00265     typedef regex_constants::syntax_option_type _FlagT;
00266 
00267   public:
00268     _Nfa(_FlagT __f)  // not explicit; starts with no states and no accepting states
00269     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
00270     { }
00271 
00272     ~_Nfa()
00273     { }
00274 
00275     _FlagT
00276     _M_options() const  // the flags passed to the constructor
00277     { return _M_flags; }
00278 
00279     _StateIdT
00280     _M_start() const  // always 0 as far as this class is concerned: nothing here reassigns it
00281     { return _M_start_state; }
00282 
00283     const _StateSet&
00284     _M_final_states() const  // ids recorded by _M_insert_accept()
00285     { return _M_accepting_states; }
00286 
00287     _SizeT
00288     _M_sub_count() const  // implements _Automaton::_M_sub_count
00289     { return _M_subexpr_count; }
00290     // Appends an accept state, records its id as accepting, and returns that id.
00291     _StateIdT
00292     _M_insert_accept()
00293     {
00294       this->push_back(_StateT(_S_opcode_accept));
00295       _M_accepting_states.insert(this->size()-1);
00296       return this->size()-1;  // size_type is converted to the int-based _StateIdT here
00297     }
00298     // Appends an alternative state with successors __next and __alt; returns its id.
00299     _StateIdT
00300     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
00301     {
00302       this->push_back(_StateT(__next, __alt));
00303       return this->size()-1;
00304     }
00305     // Appends a match state holding a copy of __m (taken by value); returns its id.
00306     _StateIdT
00307     _M_insert_matcher(_Matcher __m)
00308     {
00309       this->push_back(_StateT(__m));
00310       return this->size()-1;
00311     }
00312     // Appends a subexpr-begin state numbered with the current count, then increments the count.
00313     _StateIdT
00314     _M_insert_subexpr_begin(const _Tagger& __t)
00315     {
00316       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++, __t));
00317       return this->size()-1;
00318     }
00319     // Appends a subexpr-end state for the caller-supplied index __i; the count is not changed.
00320     _StateIdT 
00321     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
00322     {
00323       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
00324       return this->size()-1;
00325     }
00326 
00327 #ifdef _GLIBCXX_DEBUG
00328     std::ostream&
00329     _M_dot(std::ostream& __ostr) const;
00330 #endif
00331 
00332   private:
00333     _FlagT     _M_flags;
00334     _StateIdT  _M_start_state;
00335     _StateSet  _M_accepting_states;
00336     _SizeT     _M_subexpr_count;
00337   };
00338 
00339   // Describes a sequence of one or more %_State, its current start and end(s).
00340   //
00341   // This structure contains fragments of an NFA during construction.
00342   class _StateSeq
00343   {
00344   public:
00345     // Constructs a single-node sequence: start and first end are both __s, second end is __e.
00346     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
00347     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
00348     { }
00349     // Constructs a split sequence from two other sequences
00350     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
00351     : _M_nfa(__e1._M_nfa),
00352       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),  // appends a new alt state to __e1's NFA
00353       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)  // only the _M_end1 of each input is kept
00354     { }
00355 
00356     // Constructs a split sequence from a single sequence
00357     _StateSeq(const _StateSeq& __e, _StateIdT __id)
00358     : _M_nfa(__e._M_nfa),
00359       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),  // new alt state: next = __id, alt = __e's start
00360       _M_end1(__id), _M_end2(__e._M_end1)
00361     { }
00362 
00363     // Constructs a copy of a %_StateSeq
00364     _StateSeq(const _StateSeq& __rhs)
00365     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
00366       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
00367     { }
00368 
00369     // Declared only; presumably defined in bits/regex_nfa.tcc along with the members below - confirm.
00370     _StateSeq& operator=(const _StateSeq& __rhs);
00371 
00372     _StateIdT
00373     _M_front() const  // the start state id of this sequence
00374     { return _M_start; }
00375 
00376     // Extends a sequence by one.
00377     void
00378     _M_push_back(_StateIdT __id);
00379 
00380     // Extends and maybe joins a sequence.
00381     void
00382     _M_append(_StateIdT __id);
00383 
00384     void
00385     _M_append(_StateSeq& __rhs);
00386 
00387     // Clones an entire sequence.
00388     _StateIdT
00389     _M_clone();
00390 
00391   private:
00392     _Nfa&     _M_nfa;    // declared first: the split constructors call through it while initialising _M_start
00393     _StateIdT _M_start;
00394     _StateIdT _M_end1;
00395     _StateIdT _M_end2;
00396 
00397   };
00398 
00399 } // namespace __regex
00400 } // namespace std
00401 
00402 #include <bits/regex_nfa.tcc>
00403