// $Id: regexp.cc 1.7.1.1 Mon, 18 Aug 1997 17:11:28 -0700 wlee $
// 
//  Copyright (c) 1994 by the University of Southern California
//  and/or the International Business Machines Corporation.
//  All rights reserved.
//
//  Permission to use, copy, modify, and distribute this software and
//  its documentation in source and binary forms for lawful
//  non-commercial purposes and without fee is hereby granted, provided
//  that the above copyright notice appear in all copies and that both
//  the copyright notice and this permission notice appear in supporting
//  documentation, and that any documentation, advertising materials,
//  and other materials related to such distribution and use acknowledge
//  that the software was developed by the University of Southern
//  California, Information Sciences Institute and/or the International
//  Business Machines Corporation.  The name of the USC or IBM may not
//  be used to endorse or promote products derived from this software
//  without specific prior written permission.
//
//  NEITHER THE UNIVERSITY OF SOUTHERN CALIFORNIA NOR INTERNATIONAL
//  BUSINESS MACHINES CORPORATION MAKES ANY REPRESENTATIONS ABOUT
//  THE SUITABILITY OF THIS SOFTWARE FOR ANY PURPOSE.  THIS SOFTWARE IS
//  PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES,
//  INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
//  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, TITLE, AND 
//  NON-INFRINGEMENT.
//
//  IN NO EVENT SHALL USC, IBM, OR ANY OTHER CONTRIBUTOR BE LIABLE FOR ANY
//  SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES, WHETHER IN CONTRACT,
//  TORT, OR OTHER FORM OF ACTION, ARISING OUT OF OR IN CONNECTION WITH,
//  THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
//  Questions concerning this software should be directed to 
//  info-ra@isi.edu.
//
//  Author(s): Cengiz Alaettinoglu <cengiz@isi.edu>

#include "config.hh"

#include "debug.hh"
#include "regexp.hh"
#include <ASMap.h>
#include <ASMacroMap.h>
#undef MAP_H
#include <map.h>
#include <cstring>


int regexp::expand_as_macros = 0;

////////////////////////////// common stuff ////////////////////

// Language equality: compares the machine returned by dfa() for this
// regexp against the one returned for b, using rd_equal_dfa().
int regexp::operator==(regexp& b) {
   rd_fm* mine   = dfa();
   rd_fm* theirs = b.dfa();
   return rd_equal_dfa(mine, theirs);
}

int regexp::is_universal() {
   return rd_is_universal_dfa(dfa());
}

int regexp::is_empty() {
   return rd_is_empty_dfa(dfa());
}

// Returns the machine cached in m, building it on first use.
// The build pipeline is: re2nfa() -> rd_make_bol -> rd_make_eol ->
// rd_ntod -> rd_minimize. Later calls return the cached pointer.
rd_fm* regexp::dfa() {
   // Already built: hand back the cached machine.
   if (m)
      return m;

   rd_init();
   m = re2nfa();
   m = rd_make_bol(m);
   m = rd_make_eol(m);
   m = rd_ntod(m);
   rd_minimize(m);

   return m;
}

////////////////////////////// virtual dup ////////////////////

// Returns a newly allocated copy of this node, made with the copy constructor.
regexp* regexp_empty_set::dup() const {
   regexp_empty_set* clone = new regexp_empty_set(*this);
   return clone;
}

// Returns a newly allocated, default-constructed regexp_bol
// (nothing is copied from this object).
regexp* regexp_bol::dup() const {
   regexp_bol* fresh = new regexp_bol;
   return fresh;
}

// Returns a newly allocated, default-constructed regexp_eol
// (nothing is copied from this object).
regexp* regexp_eol::dup() const {
   regexp_eol* fresh = new regexp_eol;
   return fresh;
}

// Returns a newly allocated copy of this node, made with the copy constructor.
regexp* regexp_empty_str::dup() const {
   regexp_empty_str* clone = new regexp_empty_str(*this);
   return clone;
}

// Returns a newly allocated copy of this symbol node.
// The const qualifier is cast away before invoking the copy constructor,
// exactly as the original expression did.
regexp* regexp_symbol::dup() const {
   regexp_symbol& self = (regexp_symbol&) *this;
   return new regexp_symbol(self);
}

// Returns a newly allocated copy of this concatenation node,
// made with the copy constructor.
regexp* regexp_cat::dup() const {
   regexp_cat* clone = new regexp_cat(*this);
   return clone;
}

// Returns a newly allocated copy of this alternation node,
// made with the copy constructor.
regexp* regexp_or::dup() const {
   regexp_or* clone = new regexp_or(*this);
   return clone;
}

// Returns a newly allocated copy of this star node,
// made with the copy constructor.
regexp* regexp_star::dup() const {
   regexp_star* clone = new regexp_star(*this);
   return clone;
}

// Returns a newly allocated copy of this optional ("?") node,
// made with the copy constructor.
regexp* regexp_question::dup() const {
   regexp_question* clone = new regexp_question(*this);
   return clone;
}

// Returns a newly allocated copy of this plus node,
// made with the copy constructor.
regexp* regexp_plus::dup() const {
   regexp_plus* clone = new regexp_plus(*this);
   return clone;
}

// Returns a newly allocated copy of this normal-form regexp,
// made with the regexp_nf copy constructor.
regexp* regexp_nf::dup() const {
   regexp_nf* clone = new regexp_nf(*this);
   return clone;
}

////////////////////////////// virtual re2nfa ////////////////////

// The machine for the empty set is whatever rd_empty_set_machine() produces.
rd_fm* regexp_empty_set::re2nfa() const {
   rd_fm* machine = rd_empty_set_machine();
   return machine;
}

// Beginning-of-line: an empty-string machine with its bolexp flag set to 1.
rd_fm*  regexp_bol::re2nfa() const {
   rd_fm* machine = rd_empty_string_machine();
   machine->bolexp = 1;
   return machine;
}

// End-of-line: an empty-string machine with its eolexp flag set to 1.
rd_fm*  regexp_eol::re2nfa() const {
   rd_fm* machine = rd_empty_string_machine();
   machine->eolexp = 1;
   return machine;
}

// The machine for the empty string is whatever rd_empty_string_machine() produces.
rd_fm*  regexp_empty_str::re2nfa() const {
   rd_fm* machine = rd_empty_string_machine();
   return machine;
}

// Concatenation: builds the machines of both operands and joins them
// with rd_concatenate().
rd_fm*  regexp_cat::re2nfa() const {
   rd_fm* head = left->re2nfa();
   rd_fm* tail = right->re2nfa();
   return rd_concatenate(head, tail);
}

// Alternation: builds the machines of both operands and merges them
// with rd_alternate().
rd_fm*  regexp_or::re2nfa() const {
   rd_fm* first  = left->re2nfa();
   rd_fm* second = right->re2nfa();
   return rd_alternate(first, second);
}

// Star: wraps the operand's machine with rd_zero_or_more().
rd_fm* regexp_star::re2nfa() const {
   rd_fm* operand = left->re2nfa();
   return rd_zero_or_more(operand);
}

// Optional: wraps the operand's machine with rd_zero_or_one().
rd_fm*  regexp_question::re2nfa() const {
   rd_fm* operand = left->re2nfa();
   return rd_zero_or_one(operand);
}

// Plus: wraps the operand's machine with rd_one_or_more().
rd_fm*  regexp_plus::re2nfa() const {
   rd_fm* operand = left->re2nfa();
   return rd_one_or_more(operand);
}




////////////////////////////// build regexp tree ////////////////////

// Builds the concatenation "l r", simplifying trivial cases.
//
// Both operands are consumed: each is either returned, embedded in the
// new node, or deleted. Simplifications, checked in this order:
//   l is the empty set    -> l   (r deleted)
//   r is the empty set    -> r   (l deleted)
//   l is the empty string -> r   (l deleted)
//   r is the empty string -> l   (r deleted)
// Otherwise a new regexp_cat node owning l and r is returned.
regexp* regexp::cat(regexp *l, regexp *r) const {
   ASSERT(l);
   ASSERT(r);

   regexp *keep = NULL;   // operand that by itself is the result
   regexp *drop = NULL;   // operand made redundant by the simplification

   if (l->regexp_type == REGEXP_EMPTY_SET) {
      keep = l;
      drop = r;
   } else if (r->regexp_type == REGEXP_EMPTY_SET) {
      keep = r;
      drop = l;
   } else if (l->regexp_type == REGEXP_EMPTY_STR) {
      keep = r;
      drop = l;
   } else if (r->regexp_type == REGEXP_EMPTY_STR) {
      keep = l;
      drop = r;
   }

   if (keep) {
      delete drop;
      return keep;
   }

   regexp_cat* node = new regexp_cat;
   node->regexp_type = REGEXP_CAT;
   node->left = l;
   node->right = r;

   return node;
}

// Builds the alternation "(l | r)", simplifying trivial cases.
//
// Both operands are consumed: each is either returned, embedded in the
// result, or deleted. Simplifications, checked in this order:
//   l is the empty set    -> r            (l deleted)
//   r is the empty set    -> l            (r deleted)
//   l is the empty string -> question(r)  (l deleted)
//   r is the empty string -> question(l)  (r deleted)
//   *l == *r              -> l            (r deleted)
// Otherwise a new regexp_or node owning l and r is returned.
// The *l == *r test goes through regexp::operator==, and is only
// evaluated when none of the cheaper cases applied.
regexp* regexp::or(regexp *l, regexp *r) const {
   ASSERT(l);
   ASSERT(r);

   regexp *keep = NULL;      // operand that survives the simplification
   regexp *drop = NULL;      // operand to delete
   int make_optional = 0;    // wrap the survivor with question()?

   if (l->regexp_type == REGEXP_EMPTY_SET) {
      keep = r;
      drop = l;
   } else if (r->regexp_type == REGEXP_EMPTY_SET) {
      keep = l;
      drop = r;
   } else if (l->regexp_type == REGEXP_EMPTY_STR) {
      keep = r;
      drop = l;
      make_optional = 1;
   } else if (r->regexp_type == REGEXP_EMPTY_STR) {
      keep = l;
      drop = r;
      make_optional = 1;
   } else if (*l == *r) {
      keep = l;
      drop = r;
   }

   if (keep) {
      delete drop;
      if (make_optional)
         return question(keep);
      return keep;
   }

   regexp_or* node = new regexp_or;
   node->regexp_type = REGEXP_OR;
   node->left = l;
   node->right = r;
   return node;
}

// Builds "l*", simplifying trivial cases. The operand is consumed.
//
//   l is the empty set, the empty string, or already a star -> l unchanged
//   l is "x?"  -> "x*": the child is taken over and the "?" node deleted
//   otherwise  -> new regexp_star node owning l
//
// NOTE(review): the empty set is returned unchanged, although the Kleene
// star of the empty set is conventionally the empty string -- confirm
// this is intended.
regexp* regexp::star(regexp *l) const {
   ASSERT(l);

   switch (l->regexp_type) {
   case REGEXP_EMPTY_SET:
   case REGEXP_EMPTY_STR:
   case REGEXP_STAR:
      return l;
   default:
      break;
   }

   regexp_star* node = new regexp_star;
   node->regexp_type = REGEXP_STAR;

   if (l->regexp_type != REGEXP_QUESTION) {
      node->left = l;
   } else {
      // Detach the child before deleting the "?" wrapper so the
      // wrapper no longer points at it.
      regexp_question *optional = (regexp_question *) l;
      node->left = optional->left;
      optional->left = NULL;
      delete l;
   }

   return node;
}

// Builds "l?", simplifying trivial cases. The operand is consumed.
//
//   l is the empty set, the empty string, a star, or already a "?"
//              -> l unchanged
//   otherwise  -> new regexp_question node owning l
regexp* regexp::question(regexp *l) const {
   ASSERT(l);

   switch (l->regexp_type) {
   case REGEXP_EMPTY_SET:
   case REGEXP_EMPTY_STR:
   case REGEXP_STAR:
   case REGEXP_QUESTION:
      return l;
   default:
      break;
   }

   regexp_question* node = new regexp_question;
   node->regexp_type = REGEXP_QUESTION;
   node->left = l;
   return node;
}

////////////////////////////// output ////////////////////

// Prints a symbol (a set of AS numbers and/or AS macros).
//
// Side effect: when the symbol has no macros, is not universal and its
// number set contains AS 0, the stored set is complemented in place and
// the `complemented` flag flipped before printing.
//
// Output form: square brackets (with a leading "^" when complemented)
// surround the contents unless the symbol is an uncomplemented single
// AS number or an uncomplemented single macro. Macros are each printed
// with a leading space.
//
// Returns os so that insertions can be chained. The original fell off the
// end of the function without returning, which is undefined behaviour.
ostream& operator<<(ostream& os, regexp_symbol& rs) {
   static re_asno_t::range zero_to_zero(0, 0);

   int put_brackets = 0;

   // Switch to the complemented representation of the same symbol.
   if (rs.asmacros.empty() && ! rs.asnumbers.universal()
       && rs.asnumbers.contains(zero_to_zero)) {
      rs.complemented = ! rs.complemented;
      rs.asnumbers.complement();
   }

   // Brackets are needed unless there is exactly one plain item to print.
   if (rs.complemented
       || ! ((rs.asnumbers.is_singleton() && rs.asmacros.length() == 0)
             || (rs.asnumbers.empty() && rs.asmacros.length() == 1))) {
      put_brackets = 1;
      os << "[";
      if (rs.complemented)
         os << "^";
   }

   // The number set is also printed when both sets are empty.
   if (!rs.asnumbers.empty() || rs.asmacros.empty())
      os << rs.asnumbers;

   for (Pix p = rs.asmacros.first(); p; rs.asmacros.next(p))
      os << " " << ASMacro_map(rs.asmacros(p));

   if (put_brackets)
      os << "]";

   return os;
}

// Prints a normal-form regexp.
//
// If it consists of exactly one conjunct holding exactly one non-negated
// regexp, that regexp is printed directly. Otherwise a regexp is
// reconstructed from the machine rs.m via construct() and printed between
// "^" and "$"; the reconstructed regexp is deleted afterwards.
//
// The original called construct() through an uninitialized regexp pointer
// (`regexp *reg = reg->construct(...)`), which is undefined behaviour.
// It is now called on rs itself, viewed as its regexp base.
//
// NOTE(review): construct() appears able to return NULL when the machine
// has no accepting path, in which case `*reg` below would dereference a
// null pointer -- confirm whether that can happen here.
ostream& operator<<(ostream& s, const regexp_nf& rs) {
   regexp_nf::RegexpConjunct *rc;
   regexp_nf::RegexpConjunct::ReInt *ri;

   // simple re, print the regexp
   if (rs.rclist.size() == 1) {
      rc = rs.rclist.head();
      if (rc->regexs.size() == 1) {
         ri = rc->regexs.head();
         if (! ri->negated) {
            s << *ri->re;
            return s;
         }
      }
   }

   // complex re, print the dfa
   const regexp &self = rs;
   regexp *reg = self.construct(rs.m);
   s << "^" << *reg << "$";
   delete reg;

   return s;
}
   
// Prints any regexp node by dispatching on its regexp_type.
//
// Unary operators (*, ?, +) parenthesize their operand only when the
// operand is a concatenation. Alternations are always parenthesized.
// Symbol and normal-form nodes are forwarded to their own printers (the
// const qualifier is cast away for those calls). Unknown types print
// "REGEXP_UNKNOWN".
ostream& operator<<(ostream& os, const regexp& r) {
   switch (r.regexp_type) {
   case REGEXP_EMPTY_SET :
      os << "E";
      break;

   case REGEXP_EMPTY_STR :
      os << "^$";
      break;

   case REGEXP_BOL :
      os << "^";
      break;

   case REGEXP_EOL :
      os << "$";
      break;

   case REGEXP_SYMBOL :
      os << (regexp_symbol &) r;
      break;

   case REGEXP_STAR : {
      const regexp& inner = *((regexp_star &) r).left;
      if (inner.regexp_type != REGEXP_CAT)
         os << inner << "*";
      else
         os << "(" << inner << ")*";
      break;
   }

   case REGEXP_QUESTION : {
      const regexp& inner = *((regexp_question &) r).left;
      if (inner.regexp_type != REGEXP_CAT)
         os << inner << "?";
      else
         os << "(" << inner << ")?";
      break;
   }

   case REGEXP_PLUS : {
      const regexp& inner = *((regexp_plus &) r).left;
      if (inner.regexp_type != REGEXP_CAT)
         os << inner << "+";
      else
         os << "(" << inner << ")+";
      break;
   }

   case REGEXP_CAT : {
      const regexp& head = *((regexp_cat &) r).left;
      const regexp& tail = *((regexp_cat &) r).right;
      os << head << " " << tail;
      break;
   }

   case REGEXP_OR : {
      const regexp& first  = *((regexp_or &) r).left;
      const regexp& second = *((regexp_or &) r).right;
      os << "(" << first << " | " << second << ")";
      break;
   }

   case REGEXP_NF :
      os << ((regexp_nf &) r);
      break;

   case REGEXP_UNKNOWN :
   default:
      os << "REGEXP_UNKNOWN";
   }

   return os;
}

// Adds members to this symbol according to `type`:
//   REGEXP_SYMBOL_AS_NO    - the AS number range named by p1..p2; when p2
//                            is null the range is the single number at p1
//   REGEXP_SYMBOL_AS_MACRO - the AS macro p1
//   REGEXP_SYMBOL_ANY      - every AS number, 0..MAX_AS
// Any other type is ignored.
//
// The original contained a stray no-op statement `as2;` and declared its
// locals at function scope; both are tidied here without changing behaviour.
void  regexp_symbol::add(RegexpSymbol_t type, Pix p1, Pix p2) {
   switch (type) {
   case REGEXP_SYMBOL_AS_NO: {
      // The first two characters of the AS name are skipped before
      // parsing (presumably the "AS" prefix -- confirm against AS_map).
      int as1 = atoi(AS_map(p1) + 2);
      int as2 = p2 ? atoi(AS_map(p2) + 2) : as1;
      asnumbers.add(as1, as2);
      break;
   }
   case REGEXP_SYMBOL_AS_MACRO:
      asmacros.add(p1);
      break;
   case REGEXP_SYMBOL_ANY:
      asnumbers.add(0, MAX_AS);
      break;
   default:
      break;
   }
}

// Builds the single-transition machine for this symbol.
//
// A range list is filled with every explicit AS number range, plus one
// single-number range for each member of each AS macro (as expanded by
// ASMacro_map). If the symbol is complemented the range list is
// complemented before the machine is created with rd_singleton().
//
// Side effect despite being const: when expand_as_macros is set, the
// expanded AS numbers are added to asnumbers and asmacros is cleared
// (constness is cast away to do so).
rd_fm* regexp_symbol::re2nfa() const {
   rd_dq *range_list = rd_alloc_range_list_empty();

   // Explicit AS number ranges.
   re_asno_t::range *rng = asnumbers.ranges.head();
   while (rng) {
      rd_insert_range(range_list, rd_alloc_range(rng->low, rng->high));
      rng = asnumbers.ranges.next(rng->ranges);
   }

   // Members of each AS macro.
   Pix macro_pix = asmacros.first();
   while (macro_pix) {
      _SetOfPix& members = ASMacro_map.expand(asmacros(macro_pix));

      Pix member_pix = members.first();
      while (member_pix) {
         int asno = atoi((char *) AS_map(members(member_pix)) + 2);
         rd_insert_range(range_list, rd_alloc_range(asno, asno));
         if (expand_as_macros)
            ((re_asno_t &) asnumbers).add(asno, asno);
         members.next(member_pix);
      }

      asmacros.next(macro_pix);
   }

   if (expand_as_macros)
      ((_SetOfPix &) asmacros).clear();

   if (complemented)
      rd_complement_range_list(range_list);

   return rd_singleton(range_list);
}

// Not expected to be called for normal-form regexps: asserts
// unconditionally. If ASSERT does not stop execution, the stored
// machine m is returned.
rd_fm* regexp_nf::re2nfa() const {
   ASSERT(0);
   rd_fm* stored = m;
   return stored;
}

////////////////////////////// fsa to regexp conversion /////////////////////

#define state rd_state

// A pair of state pointers (from-state i, to-state j), used as the key of
// the map that drives the fsa-to-regexp conversion.
struct int2 {
   state *i, *j;

   // Leaves both pointers uninitialized.
   int2() {}
   int2(state* ii, state* jj) : i(ii), j(jj) {}

   // Lexicographic ordering on (i, j); returns nonzero when a sorts before b.
   struct less {
      int operator() (const int2& a, const int2& b) const {
         if (a.i != b.i)
            return a.i < b.i;
         return a.j < b.j;
      }
   };
};

// Debug helper: writes every entry of the map to cerr, one per line, as
// "map <from-state pointer> <to-state pointer> <regexp>".
void pmap(map<int2, regexp*, int2::less> &fmtore_map) {
   map<int2, regexp*, int2::less>::iterator entry = fmtore_map.begin();

   while (entry != fmtore_map.end()) {
      const int2 &key = (*entry).first;
      cerr << "map " << key.i << " " << key.j << " " << *(*entry).second << "\n";
      ++entry;
   }
}

// Converts the machine fm back into a regexp by state elimination.
//
// A map keyed by (from-state, to-state) pairs holds, for each pair, a
// regexp describing the direct transitions between the two states.
// Two extra local states, `start` and `final`, are linked to the machine's
// start state and to each of its final states with empty-string regexps.
// Every state of fm is then eliminated in turn: for each arc i -> stt and
// each arc stt -> j the regexp  (i->stt) (stt->stt)* (stt->j)  is merged
// into the entry for i -> j. What remains under (start, final) is the
// result, wrapped with question() when fm accepts the empty string.
//
// The returned regexp is heap-allocated.
//
// NOTE(review): the final lookups use operator[] on the map; if no
// (start, final) entry exists this presumably yields a null pointer
// (and question() asserts on a null argument) -- confirm.
regexp* regexp::construct(rd_fm *fm) const {
   map<int2, regexp*, int2::less> fmtore_map;
   map<int2, regexp*, int2::less>::iterator pi, qi, si;

   regexp *prefix, *suffix, *middle;
   rd_state *stt;
   rd_arc *arc;
   regexp_symbol *r_sym;

   // initialize fmtore_map from fm
   // (all arcs between the same pair of states are folded into one symbol)
   RDQ_LIST_START(&(fm->rf_states), fm, stt, rd_state) {
      RDQ_LIST_START(&(stt->rs_arcs), stt, arc, rd_arc) {
	 int2 i2(stt, arc->ra_to);

	 pi = fmtore_map.find(i2);
	 if (pi == fmtore_map.end())
	    fmtore_map[i2] = r_sym = new regexp_symbol;
	 else
	    r_sym = (regexp_symbol *) (*pi).second;
	 r_sym->add(arc->ra_low, arc->ra_high);
	 
      } RDQ_LIST_END(&(stt->rs_arcs), stt, arc, rd_arc);
   } RDQ_LIST_END(&(fm->states), fm, stt, rd_state);

   // make two states;
   // only their addresses are used, as map keys
   state start;
   state final;

   fmtore_map[int2(&start, fm->rf_start)] = new regexp_empty_str;

   RDQ_LIST_START(&(fm->rf_final), fm, stt, rd_state) {
      fmtore_map[int2(stt, &final)] = new regexp_empty_str;
   } RDQ_LIST_END(&(fm->rf_final), fm, stt, rd_state);

   RDQ_LIST_START(&(fm->rf_states), fm, stt, rd_state) {// eliminate each state
      // pmap(fmtore_map);
      // make self looping middle
      // (the self-loop entry is removed from the map; star() takes it over)
      if ((pi = fmtore_map.find(int2(stt, stt))) != fmtore_map.end()) {
	 middle = (*pi).second;
	 middle = star(middle);
	 fmtore_map.erase(pi);
      } else
	 middle = new regexp_empty_str;

//      cerr << "Eliminating " << next[i].value() << " middle " << *middle << "\n";

      // For every arc entering stt, combine it with every arc leaving stt.
      for (pi = fmtore_map.begin(); pi != fmtore_map.end(); ) {
	 if ((*pi).first.j == stt) {
	    // cat() consumes the incoming regexp; the map entry is erased below.
	    prefix = cat((*pi).second, middle->dup());
//	    cerr << "arc from " << (*pi).first.i << " q " << *q << "\n";
	    // Entries with from-state stt are contiguous under the (i, j) ordering.
	    for (qi = fmtore_map.lower_bound(int2(stt, 0));
		 qi != fmtore_map.end() && (*qi).first.i == stt;
		 ++qi) {
	       suffix = cat(prefix->dup(), (*qi).second->dup());
//	       cerr << " into " << (*qi).first.j << " p " << *p << "\n";
	       // Merge with an existing i -> j entry, or create one.
	       if ((si = fmtore_map.find(int2((*pi).first.i, (*qi).first.j))) != fmtore_map.end())
		  (*si).second = or((*si).second, suffix);
	       else
		  fmtore_map[int2((*pi).first.i, (*qi).first.j)] = suffix;
	    }
	    delete prefix;
	    // Advance before erasing so the loop iterator stays valid.
	    si = pi;
	    ++pi;
	    fmtore_map.erase(si);
	 } else
	    ++pi;
      }
      // Drop the arcs leaving stt; only copies of them were used above.
      for (qi = fmtore_map.lower_bound(int2(stt, 0));
	   qi != fmtore_map.end() && (*qi).first.i == stt; 
	 ) {
	 delete (*qi).second;
	 si = qi;
	 ++qi;
	 fmtore_map.erase(si);
      }
	 
     delete middle;
   } RDQ_LIST_END(&(fm->states), fm, stt, rd_state);

   // check for empty string
   if (RD_ACCEPTS_EMPTY_STRING(fm))
      return question(fmtore_map[int2(&start, &final)]);
   else
      return fmtore_map[int2(&start, &final)];
}


//////////////////////////////////////// NF //////////////////////////////

// Resets this normal form to the universal language: the term list is
// cleared, any existing machine is freed, and m is replaced by the
// complement of a fresh empty-set DFA.
void regexp_nf::become_universal() {
   rclist.clear();

   if (m) {
      rd_free_dfa(m);
   }

   m = rd_empty_set_dfa();
   rd_complement_dfa(m);   // complement of the empty set
}

// Resets this normal form to the empty language: the term list is
// cleared, any existing machine is freed, and m is replaced by a fresh
// empty-set DFA.
void regexp_nf::become_empty() {
   rclist.clear();

   if (m) {
      rd_free_dfa(m);
   }

   m = rd_empty_set_dfa();
}

// Copy constructor: deep-copies every conjunct of s into this object's
// term list and duplicates s's machine with rd_duplicate_dfa().
// (The unused locals ri and ri2 of the original were removed.)
regexp_nf::regexp_nf(const regexp_nf& s) {
   RegexpConjunct *rc, *rc2;

   for (rc = s.rclist.head(); rc; rc = s.rclist.next(rc->rclist)) {
      rc2 = new RegexpConjunct(*rc);
      rclist.append(rc2->rclist);
   }

   m = rd_duplicate_dfa(s.m);
}

// Copy constructor: copies the mark and appends a copy of every ReInt of
// s to this conjunct's regexs list. The list node rclist is initialized
// to refer to this object; it is not linked into any list here.
regexp_nf::RegexpConjunct::RegexpConjunct(const RegexpConjunct &s) : 
   rclist(this) {
   mark = s.mark;

   RegexpConjunct::ReInt *src = s.regexs.head();
   while (src) {
      RegexpConjunct::ReInt *copy = new RegexpConjunct::ReInt(*src);
      regexs.append(copy->regexs);
      src = s.regexs.next(src->regexs);
   }
}

// Replaces this normal form with (this OR b).
//
// Trivial cases: nothing changes if b is empty or this is universal; this
// becomes universal if b is universal. Otherwise the union machine is
// built from duplicates of both machines (rd_dton, rd_alternate, rd_ntod,
// rd_minimize) and the term list is updated:
//   union equals this -> terms unchanged, b untouched
//   union equals b    -> terms replaced by b's terms, b becomes empty
//   otherwise         -> b's terms spliced in, b becomes empty
// Finally the old machine is freed and replaced by the union; a universal
// result has its term list cleared.
void regexp_nf::do_or(regexp_nf &b) {
   if (b.is_empty() || is_universal())
      return;

   if (b.is_universal()) {
      become_universal();
      return;
   }

   rd_fm *merged = rd_duplicate_dfa(m);
   rd_fm *other  = rd_duplicate_dfa(b.m);

   rd_init();
   rd_dton(merged);
   rd_dton(other);

   merged = rd_alternate(merged, other);
   merged = rd_ntod(merged);
   rd_minimize(merged);

   if (!rd_equal_dfa(m, merged)) {
      // b contributes something: take its terms. If the union is
      // exactly b, our own terms are redundant and dropped first.
      if (rd_equal_dfa(b.m, merged))
         rclist.clear();
      rclist.splice(b.rclist);
      b.become_empty();
   }

   rd_free_dfa(m); /* works with dfa too */
   m = merged;

   if (rd_is_universal_dfa(m))
      rclist.clear();
}

// Replaces this normal form with (this AND b).
//
// Trivial cases: nothing changes if b is universal or this is empty; this
// becomes empty if b is empty. Otherwise the intersection machine is
// computed with rd_intersect_dfa() and the term list is updated:
//   intersection equals this -> terms unchanged, b untouched
//   intersection equals b    -> terms replaced by b's terms, b becomes empty
//   otherwise                -> terms combined by do_and_terms(), b becomes empty
// Finally the old machine is freed and replaced by the intersection; an
// empty result has its term list cleared.
void regexp_nf::do_and(regexp_nf &b) {
   if (b.is_universal() || is_empty())
      return;

   if (b.is_empty()) {
      become_empty();
      return;
   }

   rd_fm *common = rd_intersect_dfa(m, b.m);

   if (!rd_equal_dfa(m, common)) {
      if (rd_equal_dfa(b.m, common)) {
         // b alone already describes the intersection
         rclist.clear();
         rclist.splice(b.rclist);
      } else {
         // genuinely new language
         do_and_terms(b);
      }
      b.become_empty();
   }

   rd_free_dfa(m); /* works with dfa too */
   m = common;

   if (rd_is_empty_dfa(m))
      rclist.clear();
}

// Combines the term lists of this and b for a conjunction; the machines
// are not touched.
//
// If this has no terms, b's terms are spliced in as they are. Otherwise,
// for every pair (conjunct of this, conjunct of b) a new conjunct is
// created holding copies of the regexps of both. The resulting list
// replaces this object's terms, and b's term list is cleared.
void regexp_nf::do_and_terms(regexp_nf &b) {
   ListHead<RegexpConjunct> product;

   if (rclist.is_empty()) {
      rclist.splice(b.rclist);
      return;
   }

   RegexpConjunct *mine = rclist.head();
   while (mine) {
      RegexpConjunct *theirs = b.rclist.head();
      while (theirs) {
         RegexpConjunct *combined = new RegexpConjunct;

         // Copy each side into a scratch conjunct, move its regexps into
         // the combined conjunct, then discard the emptied scratch copy.
         RegexpConjunct *scratch = new RegexpConjunct(*mine);
         combined->regexs.splice(scratch->regexs);
         delete scratch;

         scratch = new RegexpConjunct(*theirs);
         combined->regexs.splice(scratch->regexs);
         delete scratch;

         product.append(combined->rclist);

         theirs = b.rclist.next(theirs->rclist);
      }
      mine = rclist.next(mine->rclist);
   }

   rclist.clear();
   b.rclist.clear();
   rclist.splice(product);
}

// Replaces this normal form with its complement.
//
// Universal becomes empty and empty becomes universal. Otherwise the
// machine is complemented in place and the term list is rewritten: each
// conjunct is turned into a list of single-term conjuncts holding its
// terms with the `negated` flag flipped, and those lists are combined
// across all original conjuncts with do_and_terms().
//
// Fix: the flag is now flipped with logical `!`. The original used
// bitwise `~`, which turns 1 into -2 -- still truthy -- so a term that
// was already negated could never become non-negated again.
// (The unused local rc3 of the original was also removed.)
void regexp_nf::do_not() {
   if (is_universal()) {
      become_empty();
      return;
   }
    
   if (is_empty()) {
      become_universal();
      return;
   }
   
   // complement machine
   rd_complement_dfa(m);

   // complement terms
   regexp_nf tmp, tmp2;
   RegexpConjunct *rc1, *rc2;
   RegexpConjunct::ReInt *ri1, *ri2;

   tmp.become_universal();
   for (rc1 = rclist.head(); rc1; rc1 = rclist.next(rc1->rclist)) {
      // tmp2 collects the negated terms of this one conjunct
      tmp2.become_empty();
      for (ri1 = rc1->regexs.head(); ri1; ri1 = rc1->regexs.next(ri1->regexs)) {
         ri2 = new RegexpConjunct::ReInt(*ri1);
         ri2->negated = !ri2->negated;
         rc2 = new RegexpConjunct;
         rc2->regexs.append(ri2->regexs);
         tmp2.rclist.append(rc2->rclist);
      }
      // combine with the terms accumulated from earlier conjuncts
      tmp.do_and_terms(tmp2);
   }

   tmp2.rclist.clear();
   rclist.clear();
   rclist.splice(tmp.rclist);
}

//////////////////////////////////// match ///////////////////////////////////

#include "dbase.hh"

template class ListHead<ListNodePix>;

// Returns the state reached from rs on AS number `as`, or NULL when no
// arc of rs covers it. The scan stops early at the first arc whose low
// bound exceeds `as`, relying on the arc list being sorted.
// The fm parameter is not used.
inline rd_state *rd_next_state(rd_fm *fm, rd_state *rs, unsigned int as) {
   rd_arc *arc;   /* arc currently examined */

   RDQ_LIST_START(&(rs->rs_arcs), rs, arc, rd_arc) {
      if (ra_low_exceeds:
         0) {}
   } RDQ_LIST_END(&(rs->rs_arcs), rs, arc, rd_arc);

   return NULL;
}

// Runs the AS path through this regexp's machine.
//
// Starting at the machine's start state, each path element's AS number
// (parsed from its name after skipping the first two characters, on the
// assumption that names start with "AS") selects the next state.
// Returns FALSE as soon as there is no next state or the state reached is
// flagged RSF_REJECT; otherwise returns RD_IS_FINAL() of the last state.
int regexp::match(ListHead<ListNodePix>& path) {
   dfa();   // make sure m is built

   rd_state *current = m->rf_start;

   ListNodePix *hop = path.head();
   while (hop) {
      // we assume as numbers start with 'AS'
      unsigned int asno = atoi(AS_map(hop->pix) + 2);

      current = rd_next_state(m, current, asno);

      if (!current)
         return FALSE;
      if (current->rs_flags & RSF_REJECT)
         return FALSE;

      hop = path.next(hop->l);
   }

   return RD_IS_FINAL(current);
}
