# ACTIVESTATE TEAPOT-PKG BEGIN TM -*- tcl -*-
# -- Tcl Module

# @@ Meta Begin
# Package grammar::aycock 1.0
# Meta as::build::date 2015-05-25
# Meta as::origin      http://sourceforge.net/projects/tcllib
# Meta category        Aycock-Horspool-Earley parser generator for Tcl
# Meta description     Aycock-Horspool-Earley parser generator for Tcl
# Meta license         BSD
# Meta platform        tcl
# Meta require         {Tcl 8.4}
# Meta require         {grammar::aycock::debug 1.0}
# Meta subject         parsing ambiguous parser horspool earley aycock
# Meta subject         transducer grammar
# Meta summary         grammar::aycock
# @@ Meta End


# ACTIVESTATE TEAPOT-PKG BEGIN REQUIREMENTS

package require Tcl 8.4
package require grammar::aycock::debug 1.0

# ACTIVESTATE TEAPOT-PKG END REQUIREMENTS

# ACTIVESTATE TEAPOT-PKG BEGIN DECLARE

package provide grammar::aycock 1.0

# ACTIVESTATE TEAPOT-PKG END DECLARE
# ACTIVESTATE TEAPOT-PKG END TM
#----------------------------------------------------------------------
#
# aycock-build.tcl --
#
#	Procedures needed to compile an Aycock-Horspool-Earley parser.
#
# Copyright (c) 2006 by Kevin B. Kenny.  All rights reserved.
#
# See the file "license.terms" for information on usage and redistribution
# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
#
# RCS: @(#) $Id: aycock-build.tcl,v 1.2 2011/01/13 02:47:47 andreas_kupries Exp $
#
#----------------------------------------------------------------------

package provide grammar::aycock 1.0
package require Tcl 8.4

# Bring in procedures that aid in debugging a parser; they will in turn
# bring in procedures that implement the runtime system.

package require grammar::aycock::debug 1.0

namespace eval grammar::aycock {

    # The 'aycock' namespace exports only the 'parser' command, which
    # constructs a parser.

    namespace export parser

}

# grammar::aycock::parser --
#
#	Creates an Aycock-Earley parser.
#
# Parameters:
#	rules - A list that can be broken down into productions.  
#	dump - The optional flag, '-verbose'. If supplied, the rules
#	       and resulting LRE(0) automaton are dumped to the standard
#	       output.
#
# Results:
#	Returns the name of a parser, which is an ensemble
#	supporting a number of subcommands for processing the
#	language defined by $rules.
#
# Each production takes the form
# 	symbol ::= rhs { action }
# where symbol is a single word defining a nonterminal
# symbol; rhs is the right-hand side (a sequence of nonterminal
# or terminal symbols) and action is a single word giving
# a script to execute when the production is reduced.  Within the
# action, a variable $_ is defined, which is a list of the same
# length as rhs giving the semantic values of each symbol on the
# right-hand side.

proc ::grammar::aycock::parser {rules {dump {}}} {
    set name [MakeParser]
    ProcessRules $name $rules
    ComputeNullable $name
    RewriteGrammar $name
    MakeState0 $name
    MakeState $name 0 \u22a2
    CompleteAutomaton $name
    unset ${name}::Cores
    if {$dump eq {-verbose}} {
	puts "parser: $name"
	puts "Rules:"
	DumpRuleSet $name stdout
	puts "------------------------------------------------------------"
	DumpAutomaton $name stdout
    }
    set l [NeverReduced $name]
    if {[llength $l] != 0} {
	return -code error "Rules never reduced: $l"
    }
    unset ${name}::Items
    return $name
}

# grammar::aycock::MakeParser --
#
#	Constructs the ensemble that will contain an Aycock parser.
#
# Results:
#	Returns the name of the parser, which is an ensemble within
#	the "aycock" namespace.
#
# The following commands are members of the ensemble:
#	parse -- Parses a sequence of symbols and returns its lexical
#		 value.
#	destroy -- Destroys the parser.
#	terminals -- Lists the terminal symbols accepted by the parser
#	nonterminals -- Lists the nonterminal symbols reduced by the parser
#	save -- Returns a command to recreate the parser without needing
#		to analyze the rule set.

proc ::grammar::aycock::MakeParser {} {
    variable parserCount
    set name [namespace current]::parser[incr parserCount]
    namespace eval $name {
	namespace export parse terminals nonterminals save destroy
    }
    proc ${name}::parse {symList vallist {clientData {}}} \
	[string map [list \
			 PROC [namespace current]::Parse \
			 PARSER $name] {
			     PROC PARSER $symList $vallist $clientData
			 }]
    proc ${name}::terminals {} \
	[list [namespace current]::Terminals $name]
    proc ${name}::nonterminals {} \
	[list [namespace current]::Nonterminals $name]
    proc ${name}::save {} \
	[list [namespace current]::Save $name]
    proc ${name}::destroy {} \
	[list namespace delete $name]
    namespace eval $name {
	namespace ensemble create
    }
    return $name
}

# grammar::aycock::ProcessRules --
#
#	Processes the rule set presented to grammar::aycock::parser
#
# Parameters:
#	parser -- Name of the parser
#	rules -- Rule set
#
# Results:
#	None.
#
# Side effects:
#	RuleSet is set to be a dictionary indexed by nonterminal symbol
#	name, whose values are alternating right-hand sides and names
#	of action procedures.  A set of Action procedures is constructed
#	for the reduction actions.

proc ::grammar::aycock::ProcessRules {parser rules} {
    namespace upvar $parser \
	RuleSet RuleSet \
	ActionProcs ActionProcs \
	APCount APCount

    # Locate the "::=" symbols within the rules.

    set RuleSet [dict create]
    set ActionProcs [dict create]
    set APCount 0
    set positions {}
    set i 0
    foreach sym $rules {
	if {$sym eq {::=}} {
	    lappend positions [expr {$i-1}]
	}
	incr i
    }
    lappend positions [llength $rules]

    # For each rule, place the right-hand side and action into
    # the appropriate RuleSet entry.

    set lastp [lindex $positions 0]
    set top [lindex $rules $lastp]
    foreach p [lrange $positions 1 end] {
	set lhs [lindex $rules $lastp]
	set rhs [lrange $rules [expr {$lastp + 2}] [expr {$p - 2}]]
	set action [MakeAction $parser [lindex $rules [expr {$p - 1}]]]
	set lastp $p
	dict lappend RuleSet $lhs $rhs
	dict lappend RuleSet $lhs $action
    }

    # Make a special "start" rule (whose name is the empty string)
    # whose right-hand side is "right tack" followed by the name of
    # the initial rule.

    dict lappend RuleSet {} [list \u22a2 $top]
    dict lappend RuleSet {} [MakeAction $parser {lindex $_ 1}]

    # Clean up memory.

    unset ${parser}::ActionProcs
    unset ${parser}::APCount
    return
}

# grammar::aycock::MakeAction --
#
#	Defines an action procedure for the parser to use at run time.
#
# Parameters:
#	parser -- Name of the parser
#	body -- Body of the action procedure, which is expected to
#		return the semantic value of some nonterminal after reduction.
#
# Results:
#	Returns the name of the action procedure.
#
# Side effects:
#	Creates the action procedure, which will accept a single parameter,
#	"_", containing the semantic values of the symbols on the right-hand
#	side.

proc ::grammar::aycock::MakeAction {parser {body {lindex $_ 0}}} {
    namespace upvar $parser \
	ActionProcs ActionProcs \
	APCount APCount
    if {$body eq {}} {
	set body {lindex $_ 0}
    }
    if {![dict exists $ActionProcs $body]} {
	set pname Action\#[incr APCount]
	dict set ActionProcs $body $pname
	namespace eval $parser [list proc $pname {_ clientData} $body]
    }
    return [dict get $ActionProcs $body]
}

# grammar::aycock::ComputeNullable --
#
#	Determines which rules in the parser's rule set are nullable, that
#	is, can match the empty sequence of input symbols.
#
# Parameters:
#	parser -- Name of the parser.
#
# Results:
#	None.
#
# Side effects:
#	Sets 'Nullable' to a dictionary whose keys are nonterminal symbol
#	names and whose values are 1 if the symbol is nullable and 0 otherwise.

proc ::grammar::aycock::ComputeNullable {parser} {
    namespace upvar $parser \
	RuleSet RuleSet \
	Nullable Nullable
    set Nullable [dict create]
    set tbd {}
    dict for {lhs rules} $RuleSet {
	dict set Nullable $lhs 0
	foreach {rhs action} $rules {
	    if {[llength $rhs] == 0} {
		dict set Nullable $lhs 1
	    } else {
		set ntonly 1
		foreach sym $rhs {
		    if {![dict exists $RuleSet $sym]} {
			set ntonly 0
			break
		    }
		}
		if {$ntonly} {
		    lappend tbd $lhs $rhs
		}
	    }
	}
    }
    set changed 1
    while {$changed} {
	set changed 0
	foreach {lhs rhs} $tbd {
	    if {![dict get $Nullable $lhs]} {
		set nullable 1
		foreach sym $rhs {
		    if {![dict get $Nullable $sym]} {
			set nullable 0
			break
		    }
		}
		if {$nullable} {
		    dict set Nullable $lhs 1
		    set changed 1
		}
	    }
	}
    }
    return
}

# grammar::aycock::RewriteGrammar --
#
#	Rewrite $parser's grammar into Nihilistic Normal Form {NNF}
#
# Parameters:
#	parser -- Parser to rewrite.
#
# Results:
#	None.
#
# Side effects:
#	Rewrites the rule set to separate nullable rules from other
#	rules.  The nullable rules are distinguished by having
#	"{\u00d8}" appended to their names.

proc ::grammar::aycock::RewriteGrammar {parser} {
    namespace upvar $parser \
	RuleSet RuleSet \
	Nullable Nullable
    set newRuleSet [dict create]

    # Create a work list wth all rules not yet examined

    set worklist {}
    dict for {lhs rules} $RuleSet {
	foreach {rhs action} $rules {
	    lappend worklist $lhs $rhs 0 1 $action
	}
    }

    # Process the rules in sequence from the worklist. For each rule,
    # determine whether it contains a sequence of nullable symbols
    # on the right-hand side.  If it does, split it on the last nullable
    # symbol. Continue until all possible splits have been done.

    for {set k 0} {$k < [llength $worklist]} {incr k 5} {
	foreach {lhs rhs position candidateFlag action} \
	    [lrange $worklist $k [expr {$k+4}]] break
	set n [llength $rhs]
	while {$position < $n} {
	    set sym [lindex $rhs $position]
	    if {![dict exists $Nullable $sym]
		|| ![dict get $Nullable $sym]} {
		set candidateFlag 0
	    } else {
		set newrhs $rhs
		lset newrhs $position ${sym}\{\u00d8\}
		lappend worklist $lhs $newrhs [expr {$position+1}] \
		    $candidateFlag $action
		set candidateFlag 0
	    }
	    incr position
	}
	if {$position >= $n} {
	    if {$candidateFlag} {
		set lhs ${lhs}\{\u00d8\}
	    }
	    dict lappend newRuleSet $lhs $rhs
	    dict lappend newRuleSet $lhs $action
	}
    }
    set RuleSet $newRuleSet
    unset Nullable
    return
}

# grammar::aycock::DumpRuleSet --
#
#	Displays the set of rules in a parser.
#
# Parameters:
#	parser - Name of the parser
#	chan - Channel on which to display the rules
#
# Results:
#	None.
#
# Side effects:
#	Displays the rule set on the given channel.

proc ::grammar::aycock::DumpRuleSet {parser chan} {
    namespace upvar $parser RuleSet RuleSet
    dict for {lhs rules} $RuleSet {
	dict for {rhs action} $rules {
	    puts $chan "$lhs ::= $rhs [list [info body ${parser}::${action}]]"
	}
    }
    return
}

# grammar::aycock::MakeState0 --
#
#	Makes the first state of a parser's automaton.
#
# Parameters:
#	parser -- Parser under construction.
#
# Results:
#	None.
#
# Side effects:
#	Builds a state corresponding to the reduction of the start
#	symbol.  Creates "Completions", "Items", "Cores", and "Edges";
#	Completions will be a list of lists of right-hand-sides
#	completed in each state.
#	Items will be a list of LRE(0) items belonging to
#	the states. Each item is represented as three elements:
#	the nonterminal symbol, the rule number in that nonterminal's
#	rule list, and the position of the dot within the right-hand side.
#	Edges will be a two-level dictionary - the outer key is state
#	number and the inner key is a symbol - giving the 'goto' symbol
#	for a given state and symbol.
#	Cores is a work dictionary used to avoid state duplication.

proc ::grammar::aycock::MakeState0 {parser} {
    namespace upvar $parser \
	RuleSet RuleSet \
	Completions Completions \
	Items Items \
	Cores Cores \
	Edges Edges
    set items {}
    set i 0
    foreach {rhs action} [dict get $RuleSet {}] {
	lappend items {} $i 0
	incr i 2
    }
    set Completions [list {}]
    set Items [list $items]
    set Cores [dict create]
    set Edges [dict create]
    return
}

# grammar::aycock::MakeState --
#
#	Constructs a state of the parsing automaton.
#
# Parameters:
#	parser -- Parser under construction
#	stateIdx - Ordinal number of a state being examined.
#	sym - Symbol whose goto is being computed
#
# Results:
#	Returns goto(state,sym)
#
# Side effects:
#	Constructs a new state if necessary, updating Completions, Items
#	Cores and Edges to reflect it.

proc ::grammar::aycock::MakeState {parser stateIdx sym} {
    namespace upvar $parser \
	RuleSet RuleSet \
	Completions Completions \
	Items Items \
	Cores Cores \
	Edges Edges

    if {$sym == {}} {
	error "Null symbol in MakeState"
    }

    set complete [lindex $Completions $stateIdx]

    # Compute the epsilon-kernel items for the given transition.

    set Kitems {}
    set items [lindex $Items $stateIdx]
    foreach {lhs prodIndex pos} $items {
	set rhs [lindex [dict get $RuleSet $lhs] $prodIndex]
	if {[lindex $rhs $pos] == $sym} {
	    set nextPos [SkipOver $rhs [expr {$pos+1}]]
	    lappend Kitems [list $lhs $prodIndex $nextPos]
	}
    }

    # Determine whether we've already built the state.

    set core {}
    foreach tuple \
	[lsort -index 0 \
	     [lsort -integer -index 1 \
		  [lsort -integer -index 2 $Kitems]]] {
		      foreach {lhs prodIndex pos} $tuple break
		      lappend core $lhs $prodIndex $pos
		  }

    if {[dict exists $Cores $core]} {
	return [dict get $Cores $core]
    }

    # We haven't built it yet - so we need to build it now.  Let k and
    # nk be the state numbers for the epsilon-kernel and epsilon-non-kernel
    # states.

    set k [llength $Items]
    set nk [expr {$k + 1}]

    set Kitems $core
    set NKitems {}

    set Kedges [dict create]
    set predicted [dict create]
    set Kcomplete {}

    # enumerate all the LRE(0) items in the epsilon-kernel set

    foreach {lhs rhsIndex pos} $Kitems {
	set rhs [lindex [dict get $RuleSet $lhs] $rhsIndex]
	if {$pos == [llength $rhs]} {
	    # reduction
	    lappend Kcomplete $lhs $rhsIndex $pos
	    continue
	} elseif {![dict exists $RuleSet [set nextSym [lindex $rhs $pos]]]} {
	    # transition on a terminal symbol
	    if {![dict exists $Kedges $nextSym] } {
		dict set Kedges $nextSym {}
	    }
	} else {
	    # GOTO on a nonterminal
	    dict set Kedges $nextSym {}
	    if {![dict exists $predicted $nextSym]} {
		dict set predicted $nextSym 1
		set prhsIndex 0
		foreach {prhs paction} [dict get $RuleSet $nextSym] {
		    set ppos [SkipOver $prhs]
		    lappend NKitems $nextSym $prhsIndex $ppos
		    incr prhsIndex 2
		}
	    }
	}
    }

    # build the state for the epsilon-kernel

    lappend Completions $Kcomplete
    lappend Items $Kitems
    dict set Edges $stateIdx $sym $k
    dict set Edges $k $Kedges

    if {[llength $NKitems] == 0} {
	return $k
    }

    # now start with the non-kernel set.  We need to build it before
    # we can figure out whether we've built it already

    set NKcomplete {}

    # enumerate all the LRE(0) items in the non-kernel set

    set NKedges [dict create]
    set w 0
    while {$w < [llength $NKitems] } {
	foreach {lhs rhsIndex pos} [lrange $NKitems $w [expr {$w+2}]] break
	incr w 3
	set rhs [lindex [dict get $RuleSet $lhs] $rhsIndex]
	if {$pos == [llength $rhs]} {
	    # reduction
	    lappend NKComplete [list $lhs $rhsIndex $pos]
	    continue
	}
	set nextSym [lindex $rhs $pos]
	if {![dict exists $RuleSet $nextSym]} {
	    # transition on a terminal symbol
	    if {![dict exists $NKedges $nextSym]} {
		dict set NKedges $nextSym {}
	    }
	} else {
	    # GOTO on a nonterminal
	    dict set NKedges $nextSym {}
	    if {![dict exists $predicted $nextSym]} {
		dict set predicted $nextSym 1
		set prhsIndex 0
		dict for {prhs paction} [dict get $RuleSet $nextSym] {
		    set ppos [SkipOver $prhs]
		    lappend NKitems $nextSym $prhsIndex $ppos
		    incr prhsIndex 2
		}
	    }
	}
    }

    # Now we might be able to add NKedges, and NK, or maybe we don't need to.

    set core [lsort [dict keys $predicted]]
    if {[dict exists $Cores $core]} {
	dict set Edges $k {} [dict get $Cores $core]
    } else {
	dict set Cores $core $nk
	dict set Edges $k {} $nk
	lappend Completions $NKcomplete
	lappend Items $NKitems
	dict set Edges $nk $NKedges
    }

    # Return the new kernel state's number.

    return $k
    
}

# grammar::aycock::SkipOver --
#
#	Service procedure that skips over nullable symbols beginning at
#	a given position on a right-hand side.
#
# Parameters:
#	rhs - Right-hand side being analyzed
#	pos - Starting position within the rhs
#
# Results:
#	Returns the index of the first non-nullable symbol after $pos,
#	which will be the fictitious symbol beyond the end of the right-hand
#	side if no non-nullable symbols remain.

proc ::grammar::aycock::SkipOver {rhs {pos 0}} {
    set n [llength $rhs]
    while {$pos < $n} {
	if {[string range [lindex $rhs $pos] end-2 end] ne "\{\u00d8\}"} {
	    break
	}
	incr pos
    }
    return $pos
}

# grammar::aycock::CopmpleteAutomaton --
#
#	Completes building the parser automaton once the first state
#	has been constructed.
#
# Parameters:
#	parser -- Name of the parser.
#
# Results:
#	None.
#
# Works by a brute-force approach: for each state, for each symbol
# that the state can transition on, add goto(state,symbol) to the
# state set; iterate until convergence.

proc ::grammar::aycock::CompleteAutomaton {parser} {
    namespace upvar $parser \
	RuleSet RuleSet \
	Items Items \
	Edges Edges
    
    set changes 1
    while {$changes} {
	set changes 0
	set worklist {}
	dict for {state d} [dict get $Edges] {
	    dict for {sym v} $d {
		if {$v eq {}} {
 		    if {$state < [llength $Items]} {
 			lappend worklist \
 			    [list $state [dict exists $RuleSet $sym] $sym]
 			set changes 1
 		    }
		}
	    }
	}
	foreach tuple \
	    [lsort -integer -index 0 \
		 [lsort -integer -index 1 \
		      [lsort -dictionary -index 2 $worklist]]] {
		foreach {state - sym} $tuple break
		::grammar::aycock::GoTo $parser $state $sym
	    }
    }
}

# grammar::aycock::GoTo --
#
#	Computes goto(state,symbol) in a parser.
#
# Parameters:
#	parser -- Name of the parser
#	state -- Index of the state
#	sym -- Symbol whose goto is being computed.
#
# Results:
#	Returns the goto entry.
#
# Side effects:
#	Constructs a new state if needed.

proc ::grammar::aycock::GoTo {parser state sym} {
    namespace upvar $parser Edges Edges
    if {![dict exists $Edges $state] || ![dict exists $Edges $state $sym]} {
	return {}
    } else {
	set rv [dict get $Edges $state $sym]
	if {$rv eq {}} {
	    set rv [MakeState $parser $state $sym]
	    dict set Edges $state $sym $rv
	}
    }
    return $rv
}

# grammar::aycock::DumpAutomaton --
#
#	Displays the parsing automaton of an Aycock-Earley parser on a
#	channel.
#
# Parameters:
#	parser - Parser to display
#	chan - Channel to use
#
# Results:
#	None.
#
# Side effects:
#	Dumps the grammar (in NNF) and the states of the parsing
#	automaton.  For each state, indicates the LRE(0) items in that
#	state, the completion list for the state, and the GOTO function
#	for the state.
    
proc ::grammar::aycock::DumpAutomaton {parser chan} {
    namespace upvar $parser \
	Completions Completions \
	Items Items \
	Edges Edges
    for {set ns 0} {$ns < [llength $Completions]} {incr ns} {
	set completions [lindex $Completions $ns]
	puts $chan "state $ns:"
	if {[info exists Items]} {
	    set items [lindex $Items $ns]
	    DumpItemSet $parser $items $chan
	    puts $chan "  ------------------------------"
	}
	puts $chan "  completions:"
	DumpItemSet $parser $completions $chan
	puts $chan "  ------------------------------"
	puts $chan "  goto:"
	set worklist {}
	dict for {sym nexts} [dict get $Edges $ns] {
	    if {$sym eq {}} {
		set sym \u03b5
	    }
	    lappend worklist [list $sym $nexts]
	}
	foreach pair [lsort -integer -index 1 $worklist] {
	    foreach {sym nexts} $pair break
	    puts $chan [format "    %-22s%4d" $sym $nexts]
	}
	puts $chan "------------------------------------"
    }
}