/*	Copyright (C) 1992 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this software; see the file COPYING.  If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.  */

#include "rxparse.h"
#include "ctype.h"

/* Fallback definitions for two <ctype.h> classifiers used by the
   character-class wrappers further down.  Each fallback is supplied only
   when no macro of that name is already defined.  */
#ifndef isgraph
#define isgraph(c) (isprint (c) && !isspace (c))
#endif
#ifndef isblank
#define isblank(c) ((c) == ' ' || (c) == '\t')
#endif

/* These are the parameters which are `global' to an entire parse.
   parse_grexp fills one of these in and hands its address to every
   parsing routine.  */
struct parser_data
{
  /* Post-incremented by parse_unit for each parenthesized group it
     parses; the value before the increment is that group's number.  */
  int lparenc;
  /* Pre-incremented by parse_unit for each group that is given
     register side effects, so register numbers start at 1.  */
  int registerc;
  /* When non-null, a bitset of group numbers; parse_unit returns such a
     group's subexpression without wrapping it in register side effects.  */
  ut_Bitset syntax_groups;
  /* Start of the pattern plus its length (set by parse_grexp).  */
  const char * pat_end;
};

/* Forward declarations for the mutually recursive parsing routines:
   parse_alts calls parse_concats, which calls parse_iteration, which calls
   parse_unit, which calls parse_alts again for a parenthesized group.
   NOTE(review): P_ is presumably the project's prototype-wrapping macro;
   it is not defined in this file.  */
static struct rexp_node *parse_concats P_ ((struct rx_buf *,
					    const char **,
					    struct parser_data *,
					    int, int));
static struct rexp_node *parse_iteration P_ ((struct rx_buf *, const char **,
					      struct parser_data *,
					      int));
static struct rexp_node *parse_unit P_ ((struct rx_buf *, const char **,
					 struct parser_data *,
					 int));
static struct rexp_node *parse_alts P_ ((struct rx_buf * rxb,
					 const char ** pat,
					 struct parser_data *,
					 int istop));


/* This is the top level entry point to the recursive descent regexp
   parser.

   RXB is the buffer being compiled; its `syntax' field is consulted by the
   parsing routines and its `error' field is set by them on failure.
   PAT points at the pattern cursor: parsing starts at *PAT and *PAT is
   advanced as the pattern is consumed.
   LEN is the pattern length, so *PAT + LEN is where the pattern ends.
   SYNTAX_GROUPS, when non-null, is a bitset of group numbers that are
   parsed without register side effects (see parse_unit).

   Returns whatever parse_alts returns for the whole pattern: the
   expression tree, or 0 when nothing could be parsed.  */

struct rexp_node *
parse_grexp (rxb, pat, len, syntax_groups)
     struct rx_buf *rxb;
     const char ** pat;
     int len;
     ut_Bitset syntax_groups;
{
  struct parser_data pd;

  pd.lparenc = 0;
  pd.registerc = 0;
  pd.syntax_groups = syntax_groups;
  pd.pat_end = *pat + len;
  return parse_alts (rxb, pat, &pd, 1);
}



/* End-of-pattern tests.  Both expect `pat' (const char **) and `pd'
   (struct parser_data *) to be in scope where they are used.
   PD->pat_end is the start of the pattern plus its length, i.e. one past
   the last pattern character, so the cursor is at the end as soon as it
   reaches pat_end (hence `>=', not `>').
   EOP_CHECK makes the enclosing function return 0 at end of pattern;
   EOP_PRED is the bare predicate.  */
#define EOP_CHECK  do { if (*pat >= pd->pat_end) return 0; } while (0)
#define EOP_PRED  (*pat >= pd->pat_end)

/* MATCH_OP matches operators.  It knows whether to look for a backslash or
   not.  As a post-condition, the pattern pointer is advanced past the end of
   the operator.  Many of the distinctions implied by syntax bits are enforced
   here. */ 


static int 
match_op (rxb, pat, pd, op)
     struct rx_buf *rxb;
     const char ** pat;
     struct parser_data * pd;
     char op;
{
  const char *p = *pat;
  EOP_CHECK;
  switch (op)
    {
    case '*':
    case '[':
    case ']':
    case '.':
    case '^':
    case '$':
      if (**pat == op)
	{
	  ++(*pat);
	  return 1;
	}
      else
	return 0;

    case '|':
      /* Make sure that | is allowed and present. */
      if ((rxb->syntax && RE_LIMITED_OPS))
	return 0;
      if ((**pat == '\n' && (rxb->syntax & RE_NEWLINE_ALT))
	  || ((!(rxb->syntax & RE_NO_BK_VBAR) && *p++ == '\\' && !EOP_PRED)
	      && (*p++ == '|')))
	{
	  *pat = p;
	  return 1;
	}
      return 0;

    case '?':
    case '+':
      if (rxb->syntax & RE_LIMITED_OPS)
	return 0;
      if ((rxb->syntax & RE_BK_PLUS_QM) && *p++ != '\\' && !EOP_PRED)
	return 0;
      if (*p++ == op)
	{
	  *pat = p;
	  return 1;
	}
      return 0;
    case '{':
    case '}':
      if (!(rxb->syntax & RE_INTERVALS))
	return 0;
      if (!(rxb->syntax & RE_NO_BK_PARENS) && *p++ != '\\' && !EOP_PRED)
	return 0;
      if (*p++ == op)
	{
	  *pat = p;
	  return 1;
	}
      return 0;

    case '(':
    case ')':
      if (!(rxb->syntax & RE_NO_BK_PARENS) && *p++ != '\\' && !EOP_PRED)
	return 0;
      if (*p++ == op)
	{
	  *pat = p;
	  return 1;
	}
      return 0;
    }
}

/* LOOKING_AT_OP is MATCH_OP without side effects: P is a copy of the
   caller's pattern cursor, so whatever match_op advances is discarded.
   Returns match_op's result (non-zero when operator OP is at P).  */
static int 
looking_at_op (rxb, p, pd, op)
     struct rx_buf *rxb;
     const char *p;	/* callers pass *pat, which is `const char *' */
     struct parser_data * pd;
     char op;
{
  return match_op (rxb, &p, pd, op);
}

/* POSIX has errors for ill-formed lexemes.  GNU requires context-sensitive
   errors.  BAD_PATTERN_ERROR tries to help GNU be more specific than
   REG_BADPAT by looking for lexical errors that fit POSIX error codes.

   Returns REG_EESCAPE when *PAT is a backslash that is the final
   character of the pattern, and REG_BADPAT otherwise.  RXB is unused.  */
static int 
bad_pattern_error (rxb, pat, pd)
     struct rx_buf * rxb;
     const char ** pat;
     struct parser_data * pd;
{
  const char *here = *pat;

  if ((here + 1 == pd->pat_end) && (here[0] == '\\'))
    return REG_EESCAPE;
  return REG_BADPAT;
}


/*
 * PARSE_ALTS parses alternations (`\|').
 * ISTOP is true only when parse_alts is called from outside of any
 * kind of parens.
 *
 * Parses one branch with parse_concats and, if an alternation operator
 * follows, recursively parses the remaining branches and joins the two
 * sides with an r_alternate node.  Parsing stops successfully at the end
 * of the pattern or, inside a group, in front of a closing paren.
 *
 * Returns the tree, or 0 on failure.  On failure nothing allocated here is
 * left for the caller to free.  RXB->error is set when the text after the
 * first branch is not an alternation operator, and to REG_EEND when the
 * right-hand side is missing and no other error was recorded.
 */

static struct rexp_node *
parse_alts (rxb, pat, pd, istop)
     struct rx_buf *rxb;
     const char ** pat;
     struct parser_data * pd;
     int istop;
{
  struct rexp_node *left;
  struct rexp_node *right;
  struct rexp_node *alt;

  left = parse_concats (rxb, pat, pd, 1,
			(istop
			 && (rxb->syntax & RE_UNMATCHED_RIGHT_PAREN_ORD)));

  if (!left)
    return 0;

  if (EOP_PRED || (!istop && looking_at_op (rxb, *pat, pd, ')')))
    return left;

  if (!match_op (rxb, pat, pd, '|'))
    {
      rxb->error = bad_pattern_error (rxb, pat, pd);
      free_rexp (left);
      return 0;
    }

  right = parse_alts (rxb, pat, pd, istop);

  if (rxb->error || !right)
    {
      if (!rxb->error)
	rxb->error = REG_EEND;
      /* Give up here: LEFT must not be linked into a node once freed,
	 and a partial RIGHT must not be leaked.  */
      free_rexp (left);
      if (right)
	free_rexp (right);
      return 0;
    }

  alt = rexp_node (&rxb->rx, r_alternate);

  if (!alt)
    {
      free_rexp (left);
      free_rexp (right);
      return 0;
    }

  alt->params.pair.left = left;
  alt->params.pair.right = right;
  return alt;
}



/*
 * PARSE_CONCATS handles the implicit concatenation operator.
 * RPAREN_NORMAL says whether an unbalanced right paren is an error
 * or a normal (literal) character.
 * BOP_CONTEXT is non-zero only for the first unit of an alternation
 * branch (parse_alts passes 1; the recursive call below passes 0).
 *
 * Parses one iteration and, unless the pattern ends or an alternation
 * operator (or, when !RPAREN_NORMAL, a closing paren) follows, recursively
 * parses the rest and joins the two with an r_concat node.
 *
 * Returns the tree, or 0 on failure.  When the recursive call records an
 * error in RXB->error, everything parsed here is freed and 0 is returned.
 */

static struct rexp_node *
parse_concats (rxb, pat, pd, bop_context, rparen_normal)
     struct rx_buf *rxb;
     const char ** pat;
     struct parser_data * pd;
     int bop_context;
     int rparen_normal;
{
  struct rexp_node *left;
  struct rexp_node *right;
  struct rexp_node *concat;

  left = parse_iteration (rxb, pat, pd, bop_context);

  if (!left)
    return 0;

  if (EOP_PRED || looking_at_op (rxb, *pat, pd, '|') ||
      (!rparen_normal && looking_at_op (rxb, *pat, pd, ')')))
    return left;

  right = parse_concats (rxb, pat, pd, 0, rparen_normal);

  if (rxb->error)
    {
      /* Never hand a freed LEFT back to the caller or link it below.  */
      free_rexp (left);
      if (right)
	free_rexp (right);
      return 0;
    }
  /* Nothing more could be parsed and no error was recorded: the
     concatenation is just LEFT.  */
  if (!right)
    return left;

  concat = rexp_node (&rxb->rx, r_concat);
  if (!concat)
    {
      free_rexp (left);
      free_rexp (right);
      return 0;
    }
  concat->params.pair.left = left;
  concat->params.pair.right = right;
  return concat;
}


/* REXP_NODE_STAR wraps ITEM in a new r_star node.
   Returns the star node, or 0 when ITEM is null or the node cannot be
   allocated; in the latter case ITEM is freed, so the caller must not
   use it again.

   ITEM is stored as the left child only.  The original stored the same
   pointer in both children, which makes any walk over both children
   (such as a recursive free) visit the subtree twice.
   NOTE(review): assumes r_star, like the disabled `+' and `?' handling in
   parse_iteration, only uses params.pair.left -- confirm against the
   consumers of the tree.  */
static struct rexp_node *
rexp_node_star (rxb, item)
     struct rx_buf *rxb;
     struct rexp_node *item;
{
  struct rexp_node *star;

  /* Callers may pass the result of a failed allocation straight in.  */
  if (!item)
    return 0;

  star = rexp_node (&rxb->rx, r_star);
  if (!star)
    {
      free_rexp (item);
      return 0;
    }
  star->params.pair.left = item;
  star->params.pair.right = 0;
  return star;
}

/* PARSE_ITERATION catches * + ? and {}.
   As compiled, it only parses a single unit with parse_unit and returns
   it (0 when parse_unit returns 0): the postfix-operator handling below is
   disabled with `#if 0'.  */
static struct rexp_node *
parse_iteration (rxb, pat, pd, bop_context)
     struct rx_buf *rxb;
     const char ** pat;
     struct parser_data * pd;
     int bop_context;
{
  struct rexp_node *item;

  item = parse_unit (rxb, pat, pd, bop_context);
  if (!item)
    return 0;
  /* Disabled code: would repeatedly wrap ITEM for each trailing `*', `+'
     or `?'.  NOTE(review): it calls `rexp_node_start', while the helper
     defined in this file is named `rexp_node_star'; it would not build
     as written.  */
#if 0
next:
  if (match_op (rxb, pat, pd, '*'))
    {
      item = rexp_node_start (rxb, item);
      goto next;
    }
  else if (match_op (rxb, pat, pd, '+'))
    {
      struct rexp_node *concat = rexp_node (&rxb->rx, r_concat);
      struct rexp_node *star = rexp_node (&rxb->rx, r_star);
      struct rexp_node *item_copy = copy_rexp (&rxb->rx, item);
      if (!(concat && star && item_copy))
	{
	  if (concat)
	    free_rexp (concat);
	  if (star)
	    free_rexp (star);
	  if (item_copy)
	    free_rexp (item_copy);
	  return 0;
	}
      star->params.pair.left = item;
      concat->params.pair.left = item_copy;
      concat->params.pair.right = star;
      item = concat;
      goto next;
    }
  else if (match_op (rxb, pat, pd, '?'))
    {
      struct rexp_node *opt = rexp_node (&rxb->rx, r_opt);
      if (!opt)
	{
	  free_rexp (item);
	  return 0;
	}
      opt->params.pair.left = item;
      item = opt;
      goto next;
    }
#endif
  return item;
}



/* The <ctype.h> classifiers may be macros, so they cannot be stored in a
 * table of function pointers.  CHAR_TEST_FN (FN_NAME, CLASSIFIER) defines a
 * real function FN_NAME that applies CLASSIFIER to its argument, which is
 * taken as an unsigned char.  The wrappers are named like `alphap' for
 * ctype's `isalpha'.
 */
#define char_test_fn(fn_name, classifier)				\
static int fn_name (ch) unsigned char ch; { return classifier (ch); }

char_test_fn (alnump,  isalnum)
char_test_fn (alphap,  isalpha)
char_test_fn (blankp,  isblank)
char_test_fn (cntrlp,  iscntrl)
char_test_fn (digitp,  isdigit)
char_test_fn (graphp,  isgraph)
char_test_fn (lowerp,  islower)
char_test_fn (printp,  isprint)
char_test_fn (punctp,  ispunct)
char_test_fn (spacep,  isspace)
char_test_fn (upperp,  isupper)
char_test_fn (xdigitp, isxdigit)

/* One named character class: the class name as it is spelled in a
   pattern, and the function that tests whether a character belongs to
   the class.  */
struct char_class
{
  char * name;		/* compared against the pattern by match_class */
  int (*pred)();	/* called by init_class with each character code */
};

/* The named character classes specified by POSIX, each paired with the
   wrapper function that tests membership.  parse_unit tries the entries
   in this order.  */
static struct char_class char_class[] =
{
  { "alnum",  alnump  },
  { "alpha",  alphap  },
  { "blank",  blankp  },
  { "cntrl",  cntrlp  },
  { "digit",  digitp  },
  { "graph",  graphp  },
  { "lower",  lowerp  },
  { "print",  printp  },
  { "punct",  punctp  },
  { "space",  spacep  },
  { "upper",  upperp  },
  { "xdigit", xdigitp },
};
/* Number of entries in the table above.  */
#define NUM_CHARCLASSES (sizeof (char_class) / sizeof (char_class[0]))


/* This matches something like "alnum:]", i.e. the last part of a character
 * class. 
 *
 * NAME is the class name to look for at *POS.  When NAME is followed
 * immediately by ":]" and all of it lies before PD->pat_end, *POS is
 * advanced past the "]" and 1 is returned.  Otherwise 0 is returned and
 * *POS is left unchanged.  RXB is unused.
 */
static int match_class (rxb, pos, pd, name)
     struct rx_buf * rxb;
     const char ** pos;		/* callers pass the `const char **' cursor */
     struct parser_data * pd;
     const char * name;
{
  const char * p = *pos;

  while (*name && (p < pd->pat_end))
    if (*name++ != *p++)
      return 0;
  /* The pattern ran out before the whole name was seen.  */
  if (*name)
    return 0;
  /* Both closing characters must lie inside the pattern before either
     is read.  */
  if (pd->pat_end - p < 2)
    return 0;
  if ((p[0] != ':') || (p[1] != ']'))
    return 0;
  *pos = p + 2;
  return 1;
}

/* This builds a character class from its predicate function: every
   character code from 0 up to (but not including)
   RXB->rx.local_cset_size for which PRED returns non-zero is added
   to CS.  */
static void init_class (rxb, cs, pred)
     struct rx_buf * rxb;
     ut_Bitset cs;
     int (*pred)();
{
  int code = 0;

  while (code < rxb->rx.local_cset_size)
    {
      if (pred (code))
	UT_bitset_enjoin (cs, code);
      ++code;
    }
}

/* REXP_NODE_ANY builds an r_cset node for the "any character" operator.
   The set starts as every character code below RXB->rx.local_cset_size;
   newline is removed unless the RE_DOT_NEWLINE syntax bit is set, and the
   null character is removed when the RE_DOT_NOT_NULL bit is set.
   Returns the node, or 0 when the set or the node cannot be allocated.  */
static struct rexp_node *
rexp_node_any (rxb)
     struct rx_buf *rxb;
{
  ut_Bitset cs = cset (&rxb->rx);
  struct rexp_node *unit;
  if (!cs)
    return 0;
  unit = rexp_node (&rxb->rx, r_cset);
  if (!unit)
    {
      free_cset (cs);
      return 0;
    }
  ut_bitset_universe (rxb->rx.local_cset_size, cs);
  /* Syntax bits are tested with `&'; `&&' would only test that any bit
     at all is set.  */
  if (!(rxb->syntax & RE_DOT_NEWLINE))
    UT_bitset_remove (cs, '\n');
  /* RE_DOT_NOT_NULL being set is what forbids matching the null
     character, so the test is not negated.  */
  if (rxb->syntax & RE_DOT_NOT_NULL)
    UT_bitset_remove (cs, 0);
  unit->params.cset = cs;
  return unit;
}

/* PARSE_UNIT catches characters, character sets, groups, ^ $ and other
 *  special epsilons. 
 *
 * It parses exactly one of, tried in this order:
 *   - a parenthesized group (recursing through parse_alts),
 *   - a `^' or `$' anchor, when BOP_CONTEXT is set or the syntax has
 *     RE_CONTEXT_INDEP_ANCHORS,
 *   - a bracketed character list or named character class,
 *   - `?' (any single character) or `*' (zero or more of any character,
 *     or of the group that immediately follows it),
 *   - a backslash-digit back reference,
 *   - a literal character.
 *
 * Returns the node for the unit, advancing *PAT past it.  Returns 0 when
 * the pattern is exhausted (or a null character is reached) or on
 * failure; RXB->error is set for malformed groups and lists.  No
 * character at or beyond PD->pat_end is examined.
 */

static struct rexp_node *
parse_unit (rxb, pat, pd, bop_context)
     struct rx_buf *rxb;
     const char ** pat;
     struct parser_data * pd;
     int bop_context;
{
  struct rexp_node *unit = 0;
  int is_hat = 0;

  if ((*pat >= pd->pat_end) || !**pat)
    return 0;

  if (match_op (rxb, pat, pd, '('))
    {
      struct rexp_node *subexp = parse_alts (rxb, pat, pd, 0);
      struct rexp_node *rparen = 0;
      struct rexp_node *lparen = 0;
      struct rexp_node *concat = 0;
      int group_num = pd->lparenc++;
      int reg_num;
      if (rxb->error || !subexp)
	{
	  if (!rxb->error)
	    rxb->error = REG_BADPAT;
	  goto group_error;
	}
      if (!match_op (rxb, pat, pd, ')'))
	{
	  rxb->error = REG_EPAREN;
	  goto group_error;
	}
      /* Groups listed in syntax_groups get no register side effects.  */
      if (pd->syntax_groups && UT_bitset_member (pd->syntax_groups, group_num))
	return subexp;
      unit = rexp_node (&rxb->rx, r_concat);
      if (!unit)
	goto group_mem_error;
      /* The old regex starts numbering parens with `1', hence the pre++. */
      reg_num = ++pd->registerc;
      concat = rexp_node (&rxb->rx, r_concat);
      if (!concat)
	goto group_mem_error;
      rparen = rexp_node (&rxb->rx, r_side_effect);
      if (!rparen)
	goto group_mem_error;
      lparen = rexp_node (&rxb->rx, r_side_effect);
      if (!lparen)
	goto group_mem_error;

      /* Build  lparen . (subexp . rparen)  */
      unit->params.pair.left = lparen;
      unit->params.pair.right = concat;
      concat->params.pair.left = subexp;
      concat->params.pair.right = rparen;
      lparen->params.side_effect = (void *)(regex_lparen_1 + reg_num);
      rparen->params.side_effect = (void *)(regex_rparen_1 + reg_num);
      return unit;

    group_mem_error:
      rxb->error = REG_ESPACE;
    group_error:
      /* None of the nodes have been linked together yet, so each one is
	 released on its own -- including the parsed subexpression.  */
      if (subexp)
	free_rexp (subexp);
      if (unit)
	free_rexp (unit);
      if (concat)
	free_rexp (concat);
      if (rparen)
	free_rexp (rparen);
      if (lparen)
	free_rexp (lparen);
      return 0;
    }

#if 0
  else if (((rxb->syntax & RE_CONTEXT_INVALID_OPS)
	    || (rxb->syntax & RE_CONTEXT_INDEP_OPS))
	   && bop_context
	   && (match_op (rxb, pat, pd, '*')
	       || match_op (rxb, pat, pd, '+')
	       || match_op (rxb, pat, pd, '?')
	       || match_op (rxb, pat, pd, '{')))
    {
      rxb->error = REG_BADRPT;
      return 0;
    }
#endif
  else if ((bop_context || (rxb->syntax & RE_CONTEXT_INDEP_ANCHORS))
	   && ((is_hat = match_op (rxb, pat, pd, '^'))
	       || match_op (rxb, pat, pd, '$')))
    {
      unit = rexp_node (&rxb->rx, r_side_effect);
      if (!unit)
	return 0;
      unit->params.side_effect = (void *)(is_hat ? regex_hat : regex_dollar);
      return unit;
    }

  else if (match_op (rxb, pat, pd, '['))
    {
      int complement = 0;
      int charclass = 0;
      ut_Bitset cs;

      /* Only look at the character after `[' if there is one.  */
      if (*pat < pd->pat_end)
	{
	  complement = (**pat == '^');
	  charclass = (!complement && (rxb->syntax & RE_CHAR_CLASSES)
		       && (**pat == ':'));
	}
      cs = cset (&rxb->rx);
      if (complement || charclass) ++*pat;
      if (!cs)
	return 0;
      unit = rexp_node (&rxb->rx, r_cset);
      if (!unit)
	{
	  free_cset (cs);
	  return 0;
	}
      if (charclass)
	{
	  int x;
	  for (x = 0; x < NUM_CHARCLASSES; ++x)
	    {
	      if (match_class (rxb, pat, pd, char_class[x].name))
		{
		  init_class (rxb, cs, char_class[x].pred);
		  goto got_cs;
		}
	    }
	}

      if (*pat >= pd->pat_end)
	goto list_unterminated;
      /* A `]' in first position is a member, not the terminator.  */
      if (**pat == ']')
	{
	  UT_bitset_enjoin (cs, ']');
	  ++*pat;
	}
      while (1)
	{
	  int ch;

	  if (*pat >= pd->pat_end)
	    goto list_unterminated;
	  if (**pat == ']')
	    {
	      ++*pat;
	      break;
	    }
	  if ((**pat == '\\')
	      && (rxb->syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
	    {
	      ++*pat;
	      if (*pat >= pd->pat_end)
		goto list_unterminated;
	    }
	  /* A range needs three characters; look ahead only when they
	     are all inside the pattern, so that a single member just
	     before the end (as in "[a]") is still accepted.  */
	  if ((pd->pat_end - *pat >= 3)
	      && ((*pat)[1] == '-') && ((*pat)[2] != ']'))
	    {
	      /* Characters index the set as unsigned values.  */
	      int low = (unsigned char) **pat;
	      int high = (unsigned char) (*pat)[2];
	      *pat += 3;
	      if ((low > high) && (rxb->syntax & RE_NO_EMPTY_RANGES))
		{
		  free_cset (cs);
		  free_rexp (unit);
		  rxb->error = REG_ERANGE;
		  return 0;
		}
	      while (low <= high)
		{
		  UT_bitset_enjoin (cs, low);
		  ++low;
		}
	      continue;
	    }
	  ch = (unsigned char) **pat;
	  UT_bitset_enjoin (cs, ch);
	  ++*pat;
	}

    got_cs:
      if (complement)
	{
	  ut_bitset_complement (rxb->rx.local_cset_size, cs);
	  if (rxb->syntax & RE_HAT_LISTS_NOT_NEWLINE)
	    UT_bitset_remove (cs, '\n');
	}
      unit->params.cset = cs;
      return unit;

    list_unterminated:
      /* The pattern ended before the closing `]'.  */
      free_cset (cs);
      free_rexp (unit);
      rxb->error = REG_EBRACK;
      return 0;
    }

#if 0
  else if (match_op (rxb, pat, pd, '.'))
    return rexp_node_any (rxb);
#else
  else if (match_op (rxb, pat, pd, '?'))
    return rexp_node_any (rxb);
  else if (match_op (rxb, pat, pd, '*'))
    {
      if (looking_at_op (rxb, *pat, pd, '('))
	unit = parse_unit (rxb, pat, pd, bop_context);
      else
        unit = rexp_node_any (rxb);
      /* Do not build a star around a failed operand.  */
      if (!unit)
	return 0;
      return rexp_node_star (rxb, unit);
    }
#endif
  else
    {
      ut_Bitset cs;
      int ch;

      if (**pat == '\\')
	{
	  /* A backslash that ends the pattern escapes nothing.  Leave
	     *PAT on it -- the position bad_pattern_error tests for --
	     rather than stepping past the end of the pattern.  */
	  if (pd->pat_end - *pat < 2)
	    return 0;
	  ++*pat;
	  if (isdigit ((unsigned char) **pat)
	      && !(rxb->syntax & RE_NO_BK_REFS))
	    {
	     unit = rexp_node (&rxb->rx, r_side_effect);
	     if (!unit)
	       return 0;
	     unit->params.side_effect =
	       ((void *)(regex_backreference_1 + (**pat) - '0'));
	     ++*pat;
	     return unit;
	    }
	}
      /* A literal: a set holding just this character.  */
      cs = cset (&rxb->rx);
      if (!cs)
	return 0;
      unit = rexp_node (&rxb->rx, r_cset);
      if (!unit)
	{
	  free_cset (cs);
	  return 0;
	}
      ch = (unsigned char) **pat;
      UT_bitset_enjoin (cs, ch);
      ++(*pat);
      unit->params.cset = cs;
      return unit;
    }
}
