(*  Title: 	Lexicon
    Author: 	Tobias Nipkow
*)

signature LEXICON =
sig
  type Lexicon
  datatype Token = Token of int | IdentSy of string | VarSy of string * int |
                   end_token;
  val end_token: Token
  val mk_lexicon: string list -> Lexicon
  val tokenize: Lexicon -> string -> Token list
  val token_to_string: Lexicon -> Token -> string
  val scan_varname: string list -> (string*int) * string list
  val string_of_vname: string * int -> string
  val matching_tokens: Token * Token -> bool
  val valued_token: Token -> bool
  val predef_term: string -> Token
  type 'b TokenMap
  val mkTokenMap: ('b * Token list) list -> 'b TokenMap
  val applyTokenMap: 'b TokenMap * Token -> 'b list
end;

functor LEXICON_FUN(Syntax_Def:SYNTAX_DEF) : LEXICON =
struct

datatype Token = Token of int
	       | IdentSy of string
	       | VarSy of string * int
	       | end_token;
val no_token = ~1;
val first_token = 0;

datatype State = State of string * int * (State list);
type DFA = State list;
type TokenTab = string list;
datatype Lexicon = Lexicon of {DFA:DFA, KeyWds:int list, TokenTab:TokenTab};

fun mk_lexicon(sl:string list) : Lexicon =
    let fun part (c,sll) = let val (sll1,sll2) =
                                   partition ((apl(c,op=)) o hd) sll;
                           in (map tl sll1, sll2) end;

        fun mk([]:string list list,_:int):DFA = [] |
            mk([c]::sll,i) = let val (cl,ncl) = part(c,sll)
                             in State(c,i,mk(cl,i+1))::
                                mk(ncl,i+length(cl)+1) end |
            mk((c::sl)::sll,i) = let val (cl,ncl) = part(c,sll)
                                 in State(c,no_token,mk(sl::cl,i))::
                                    mk(ncl,i+length(cl)+1) end;

        fun is_id(c::cs) = is_letter(c) andalso forall is_quasi_letter cs;
        fun mk_kw([],_) = [] |
            mk_kw(s::sl,i) = (if is_id(explode s) then [i] else [])
                             @ mk_kw(sl,i+1);

        val ds = distinct(sort(op<)sl);
    in Lexicon {DFA = mk(map explode ds,first_token),
                KeyWds = mk_kw(ds,first_token), TokenTab = ds} end;

fun next_state(dfa:DFA,c:string): State option =
    let fun next [] = None |
            next ((st as State(c',_,_))::stl) =
                 if c=c' then Some(st) else next stl
    in next dfa end;

exception LEX_ERR;

fun is_qld c = is_quasi_letter c orelse is_digit c;

(*A string of letters or ' _ but no digits!
  Identifiers may not contain digits because trailing digits indicate
    an offset to a variable or param name*)
fun scan_ident []  =  error("End of input; identifier expected.\n")
  | scan_ident(c::cs) =  
      if  is_letter c  then
        let val (ds,tail) = take_prefix is_qld cs
	in  (implode(c::ds), tail)  end
      else error("Identifier expected: " ^ implode(c::cs) ^ "\n");

(*Scan the offset of a Var, if present; otherwise ~1 *)
fun scan_offset cs = case cs of
    ("."::[]) => (~1,cs)
  | ("."::(ds as c::cs')) => if is_digit c then scan_int ds else (~1,cs)
  | _ => (~1,cs);

fun split_varname s =
    let val (rpost,rpref) = take_prefix is_digit (rev(explode s))
        val (i,_) = scan_int(rev rpost)
    in (implode(rev rpref), i) end;

fun scan_varname cs : (string*int) * string list =
  let val (a, ds) = scan_ident cs;
      val (i, es) = scan_offset ds
  in if i = ~1 then (split_varname a, es) else ((a,i), es) end;

fun isin_sorted_list (i:int) =
    let fun isin(j::l) = if i<j then false else
                         if i=j then true else isin l |
            isin([]) = false
    in isin end;

fun tokenize (Lexicon{DFA=dfa,KeyWds=kw,...}) (s:string) : Token list =
    let fun tokenize1 (_:DFA,[]:string list) : (Token * (string list)) =
              raise LEX_ERR |
            tokenize1(dfa,c::sl) =
              case next_state(dfa,c) of
                None => raise LEX_ERR |
                Some(State(_,t,dfa')) =>
                  if t=no_token then tokenize1(dfa',sl)
                  else (tokenize1(dfa',sl) handle LEX_ERR =>
                        if isin_sorted_list t kw andalso not(null sl) andalso
                           is_quasi_letter(hd sl)
                        then raise LEX_ERR else (Token(t),sl));

        fun tknize [] = [] |
            tknize (l as c::cl) =
              if is_blank(c) then tknize(cl) else
		let val (t,rest) = tokenize1(dfa,l)
			handle LEX_ERR =>
			if is_letter(c)
			then let val (id,rest) = scan_ident l
			     in (IdentSy(id),rest) end
			else if c = "?"
			then let val (v,rest) = scan_varname cl
			     in (VarSy(v), rest) end
			else error("Cannot scan "^(implode l)^"\n")
		in t::tknize(rest) end
    in tknize (explode s) end;

fun string_of_vname (a,idx) = if is_digit(hd(rev(explode a)))
	then a ^ "." ^ string_of_int idx
	else if idx = 0 then a else a ^ string_of_int idx

fun token_to_string (Lexicon{TokenTab=tt,...}) (Token(i):Token) : string =
      nth_elem(i,tt) |
    token_to_string _ (IdentSy(s)) = s |
    token_to_string _ (VarSy xn) = "?" ^ string_of_vname xn |
    token_to_string _ end_token = "\n";

fun matching_tokens(Token(i),Token(j)) = (i=j) |
    matching_tokens(IdentSy(_),IdentSy(_)) = true |
    matching_tokens(VarSy(_,_),VarSy(_,_)) = true |
    matching_tokens(end_token,end_token) = true |
    matching_tokens(_,_) = false;

fun valued_token(end_token) = false |
    valued_token(Token(_)) = false |
    valued_token(IdentSy(_)) = true |
    valued_token(VarSy(_,_)) = true;

fun predef_term name =
      if Ground name = Syntax_Def.SId then IdentSy("") else
      if Ground name = Syntax_Def.SVar then VarSy("",0) else
      end_token;

type 'b TokenMap = (int * 'b list) list * 'b list;

val idToken = first_token - 1;
val varToken = idToken - 1;
val endToken = varToken - 1;

fun int_of_token(Token(tk)) = tk |
    int_of_token(IdentSy _) = idToken |
    int_of_token(VarSy _) = varToken |
    int_of_token(end_token) = endToken;
(*
fun token_of_int i =
      if i=idToken then IdentSy("") else
      if i=varToken then VarSy("",0) else
      if i=endToken then end_token else Token(i);
*)
fun mkTokenMap(atll) =
    let val aill = map (fn(a,tl)=>(a,map int_of_token tl)) atll;
        val dom = sort op< (distinct(flat(map snd aill)));
        val mt = map fst (filter (null o snd) atll);
        fun mktm(i) =
            let fun add(l,(a,il)) = if i mem il then a::l else l
            in (i,itlist_left add ([],aill)) end;
    in (map mktm dom,mt) end;

fun find_al (i:int) =
    let fun find((j,al)::l) = if i<j then [] else
                              if i=j then al else find l |
            find [] = [];
    in find end;
fun applyTokenMap((l,mt),tk) = mt@(find_al (int_of_token tk) l);

end;
