Source code for nlp4e

"""Natural Language Processing (Chapter 22)"""

from collections import defaultdict
from utils4e import weighted_choice
import copy
import operator
import heapq
from search import Problem


# ______________________________________________________________________________
# 22.2 Grammars



[docs]
def Rules(**rules):
    """Create a dictionary mapping symbols to alternative sequences.
    >>> Rules(A = "B C | D E")
    {'A': [['B', 'C'], ['D', 'E']]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [alt.strip().split() for alt in rhs.split('|')]
    return rules




[docs]
def Lexicon(**rules):
    """Create a dictionary mapping symbols to alternative words.
    >>> Lexicon(Article = "the | a | an")
    {'Article': ['the', 'a', 'an']}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = [word.strip() for word in rhs.split('|')]
    return rules




[docs]
class Grammar:
    """A context-free grammar defined by a set of rewrite rules and a lexicon.

    Rules map non-terminal categories to alternative right-hand sides, while the
    lexicon maps categories to the words that belong to them."""

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word in lexicon[lhs]:
                self.categories[word].append(lhs)


[docs]
    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())



[docs]
    def isa(self, word, cat):
        """Return True iff word is of category cat"""
        return cat in self.categories[word]



[docs]
    def cnf_rules(self):
        """Returns the tuple (X, Y, Z) for rules in the form:
        X -> Y Z"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z) in rules:
                cnf.append((X, Y, Z))

        return cnf



[docs]
    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively)."""
        import random

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    rewrite(random.choice(self.rules[token]), into)
                elif token in self.lexicon:
                    into.append(random.choice(self.lexicon[token]))
                else:
                    into.append(token)
            return into

        return ' '.join(rewrite(S.split(), []))


    def __repr__(self):
        return '<Grammar {}>'.format(self.name)




[docs]
def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    with probabilities.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [alt.strip().split() for alt in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            rhs_rule = (r[:-1], prob)
            rules[lhs].append(rhs_rule)

    return rules




[docs]
def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    with probabilities.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for (lhs, rhs) in rules.items():
        rules[lhs] = []
        rhs_separate = [word.strip().split() for word in rhs.split('|')]
        for r in rhs_separate:
            prob = float(r[-1][1:-1])  # remove brackets, convert to float
            word = r[:-1][0]
            rhs_rule = (word, prob)
            rules[lhs].append(rhs_rule)

    return rules




[docs]
class ProbGrammar:
    """A probabilistic context-free grammar.

    Like :class:`Grammar`, but every rule and lexicon entry carries a
    probability, so derivations can be sampled and scored by likelihood."""

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        self.categories = defaultdict(list)

        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))


[docs]
    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())



[docs]
    def isa(self, word, cat):
        """Return True iff word is of category cat"""
        return cat in [c for c, _ in self.categories[word]]



[docs]
    def cnf_rules(self):
        """Returns the tuple (X, Y, Z, p) for rules in the form:
        X -> Y Z [p]"""
        cnf = []
        for X, rules in self.rules.items():
            for (Y, Z), p in rules:
                cnf.append((X, Y, Z, p))

        return cnf



[docs]
    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""

        def rewrite(tokens, into):
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)


    def __repr__(self):
        return '<Grammar {}>'.format(self.name)



E0 = Grammar('E0',
             Rules(  # Grammar for E_0 [Figure 22.2]
                 S='NP VP | S Conjunction S',
                 NP='Pronoun | Name | Noun | Article Noun | Digit Digit | NP PP | NP RelClause',
                 VP='Verb | VP NP | VP Adjective | VP PP | VP Adverb',
                 PP='Preposition NP',
                 RelClause='That VP'),

             Lexicon(  # Lexicon for E_0 [Figure 22.3]
                 Noun="stench | breeze | glitter | nothing | wumpus | pit | pits | gold | east",
                 Verb="is | see | smell | shoot | fell | stinks | go | grab | carry | kill | turn | feel",  # noqa
                 Adjective="right | left | east | south | back | smelly | dead",
                 Adverb="here | there | nearby | ahead | right | left | east | south | back",
                 Pronoun="me | you | I | it",
                 Name="John | Mary | Boston | Aristotle",
                 Article="the | a | an",
                 Preposition="to | in | on | near",
                 Conjunction="and | or | but",
                 Digit="0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9",
                 That="that"
             ))

E_ = Grammar('E_',  # Trivial Grammar and lexicon for testing
             Rules(
                 S='NP VP',
                 NP='Art N | Pronoun',
                 VP='V NP'),

             Lexicon(
                 Art='the | a',
                 N='man | woman | table | shoelace | saw',
                 Pronoun='I | you | it',
                 V='saw | liked | feel'
             ))

E_NP_ = Grammar('E_NP_',  # Another Trivial Grammar for testing
                Rules(NP='Adj NP | N'),
                Lexicon(Adj='happy | handsome | hairy',
                        N='man'))

E_Prob = ProbGrammar('E_Prob',  # The Probabilistic Grammar from the notebook
                     ProbRules(
                         S="NP VP [0.6] | S Conjunction S [0.4]",
                         NP="Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \
                             | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]",
                         VP="Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]",
                         Adjs="Adjective [0.5] | Adjective Adjs [0.5]",
                         PP="Preposition NP [1]",
                         RelClause="RelPro VP [1]"
                     ),
                     ProbLexicon(
                         Verb="is [0.5] | say [0.3] | are [0.2]",
                         Noun="robot [0.4] | sheep [0.4] | fence [0.2]",
                         Adjective="good [0.5] | new [0.2] | sad [0.3]",
                         Adverb="here [0.6] | lightly [0.1] | now [0.3]",
                         Pronoun="me [0.3] | you [0.4] | he [0.3]",
                         RelPro="that [0.5] | who [0.3] | which [0.2]",
                         Name="john [0.4] | mary [0.4] | peter [0.2]",
                         Article="the [0.5] | a [0.25] | an [0.25]",
                         Preposition="to [0.4] | in [0.3] | at [0.3]",
                         Conjunction="and [0.5] | or [0.2] | but [0.3]",
                         Digit="0 [0.35] | 1 [0.35] | 2 [0.3]"
                     ))

E_Chomsky = Grammar('E_Prob_Chomsky',  # A Grammar in Chomsky Normal Form
                    Rules(
                        S='NP VP',
                        NP='Article Noun | Adjective Noun',
                        VP='Verb NP | Verb Adjective',
                    ),
                    Lexicon(
                        Article='the | a | an',
                        Noun='robot | sheep | fence',
                        Adjective='good | new | sad',
                        Verb='is | say | are'
                    ))

E_Prob_Chomsky = ProbGrammar('E_Prob_Chomsky',  # A Probabilistic Grammar in CNF
                             ProbRules(
                                 S='NP VP [1]',
                                 NP='Article Noun [0.6] | Adjective Noun [0.4]',
                                 VP='Verb NP [0.5] | Verb Adjective [0.5]',
                             ),
                             ProbLexicon(
                                 Article='the [0.5] | a [0.25] | an [0.25]',
                                 Noun='robot [0.4] | sheep [0.4] | fence [0.2]',
                                 Adjective='good [0.5] | new [0.2] | sad [0.3]',
                                 Verb='is [0.5] | say [0.3] | are [0.2]'
                             ))
E_Prob_Chomsky_ = ProbGrammar('E_Prob_Chomsky_',
                              ProbRules(
                                  S='NP VP [1]',
                                  NP='NP PP [0.4] | Noun Verb [0.6]',
                                  PP='Preposition NP [1]',
                                  VP='Verb NP [0.7] | VP PP [0.3]',
                              ),
                              ProbLexicon(
                                  Noun='astronomers [0.18] | eyes [0.32] | stars [0.32] | telescopes [0.18]',
                                  Verb='saw [0.5] | \'\' [0.5]',
                                  Preposition='with [1]'
                              ))


# ______________________________________________________________________________
# 22.3 Parsing



[docs]
class Chart:
    """Class for parsing sentences using a chart data structure.
    >>> chart = Chart(E0)
    >>> len(chart.parses('the stench is in 2 2'))
    1
    """

    def __init__(self, grammar, trace=False):
        """A datastructure for parsing a string; and methods to do the parse.
        self.chart[i] holds the edges that end just before the i'th word.
        Edges are 5-element lists of [start, end, lhs, [found], [expects]]."""
        self.grammar = grammar
        self.trace = trace


[docs]
    def parses(self, words, S='S'):
        """Return a list of parses; words can be a list or string."""
        if isinstance(words, str):
            words = words.split()
        self.parse(words, S)
        # Return all the parses that span the whole input
        # 'span the whole input' => begin at 0, end at len(words)
        return [[i, j, S, found, []]
                for (i, j, lhs, found, expects) in self.chart[len(words)]
                # assert j == len(words)
                if i == 0 and lhs == S and expects == []]



[docs]
    def parse(self, words, S='S'):
        """Parse a list of words; according to the grammar.
        Leave results in the chart."""
        self.chart = [[] for i in range(len(words) + 1)]
        self.add_edge([0, 0, 'S_', [], [S]])
        for i in range(len(words)):
            self.scanner(i, words[i])
        return self.chart



[docs]
    def add_edge(self, edge):
        """Add edge to chart, and see if it extends or predicts another edge."""
        start, end, lhs, found, expects = edge
        if edge not in self.chart[end]:
            self.chart[end].append(edge)
            if self.trace:
                print('Chart: added {}'.format(edge))
            if not expects:
                self.extender(edge)
            else:
                self.predictor(edge)



[docs]
    def scanner(self, j, word):
        """For each edge expecting a word of this category here, extend the edge."""
        for (i, j, A, alpha, Bb) in self.chart[j]:
            if Bb and self.grammar.isa(word, Bb[0]):
                self.add_edge([i, j + 1, A, alpha + [(Bb[0], word)], Bb[1:]])



[docs]
    def predictor(self, edge):
        """Add to chart any rules for B that could help extend this edge."""
        (i, j, A, alpha, Bb) = edge
        B = Bb[0]
        if B in self.grammar.rules:
            for rhs in self.grammar.rewrites_for(B):
                self.add_edge([j, j, B, [], rhs])



[docs]
    def extender(self, edge):
        """See what edges can be extended by this edge."""
        (j, k, B, _, _) = edge
        for (i, j, A, alpha, B1b) in self.chart[j]:
            if B1b and B == B1b[0]:
                self.add_edge([i, k, A, alpha + [edge], B1b[1:]])




# ______________________________________________________________________________
# CYK Parsing



[docs]
class Tree:
    """A simple parse-tree node with a root label and a list of child leaves
    (which may themselves be subtrees), as built by CYK parsing."""

    def __init__(self, root, *args):
        self.root = root
        self.leaves = [leaf for leaf in args]




[docs]
def CYK_parse(words, grammar):
    """ [Figure 22.6] """
    # We use 0-based indexing instead of the book's 1-based.
    P = defaultdict(float)
    T = defaultdict(Tree)

    # Insert lexical categories for each word.
    for (i, word) in enumerate(words):
        for (X, p) in grammar.categories[word]:
            P[X, i, i] = p
            T[X, i, i] = Tree(X, word)

    # Construct X(i:k) from Y(i:j) and Z(j+1:k), shortest span first
    for i, j, k in subspan(len(words)):
        for (X, Y, Z, p) in grammar.cnf_rules():
            PYZ = P[Y, i, j] * P[Z, j + 1, k] * p
            if PYZ > P[X, i, k]:
                P[X, i, k] = PYZ
                T[X, i, k] = Tree(X, T[Y, i, j], T[Z, j + 1, k])

    return T




[docs]
def subspan(N):
    """returns all tuple(i, j, k) covering a span (i, k) with i <= j < k"""
    for length in range(2, N + 1):
        for i in range(1, N + 2 - length):
            k = i + length - 1
            for j in range(i, k):
                yield (i, j, k)



# using search algorithms in the searching part



[docs]
class TextParsingProblem(Problem):
    """A search problem that parses a list of words bottom-up into a goal symbol.

    States are partially-reduced word/category sequences; actions replace words
    with their lexical categories and replace spans of categories by the rules
    that produce them, so that a solution path corresponds to a valid parse."""

    def __init__(self, initial, grammar, goal='S'):
        """
        :param initial: the initial state of words in a list.
        :param grammar: a grammar object
        :param goal: the goal state, usually S
        """
        super(TextParsingProblem, self).__init__(initial, goal)
        self.grammar = grammar
        self.combinations = defaultdict(list)  # article combinations
        # backward lookup of rules
        for rule in grammar.rules:
            for comb in grammar.rules[rule]:
                self.combinations[' '.join(comb)].append(rule)


[docs]
    def actions(self, state):
        """Return the successor states reachable from ``state``: first replace each
        word by each of its lexical categories, and once only categories remain
        replace spans that match a grammar rule's right-hand side by that rule."""
        actions = []
        categories = self.grammar.categories
        # first change each word to the article of its category
        for i in range(len(state)):
            word = state[i]
            if word in categories:
                for X in categories[word]:
                    state[i] = X
                    actions.append(copy.copy(state))
                    state[i] = word
        # if all words are replaced by articles, replace combinations of articles by inferring rules.
        if not actions:
            for start in range(len(state)):
                for end in range(start, len(state) + 1):
                    # try combinations between (start, end)
                    articles = ' '.join(state[start:end])
                    for c in self.combinations[articles]:
                        actions.append(state[:start] + [c] + state[end:])
        return actions



[docs]
    def result(self, state, action):
        """Return the state resulting from applying ``action`` (the action is itself
        the already-computed successor state)."""
        return action



[docs]
    def h(self, state):
        """Heuristic estimating remaining cost as the number of symbols still left
        to reduce in ``state``."""
        # heuristic function
        return len(state)





[docs]
def astar_search_parsing(words, gramma):
    """bottom-up parsing using A* search to find whether a list of words is a sentence"""
    # init the problem
    problem = TextParsingProblem(words, gramma, 'S')
    state = problem.initial
    # init the searching frontier
    frontier = [(len(state) + problem.h(state), state)]
    heapq.heapify(frontier)

    while frontier:
        # search the frontier node with lowest cost first
        cost, state = heapq.heappop(frontier)
        actions = problem.actions(state)
        for action in actions:
            new_state = problem.result(state, action)
            # update the new frontier node to the frontier
            if new_state == [problem.goal]:
                return problem.goal
            if new_state != state:
                heapq.heappush(frontier, (len(new_state) + problem.h(new_state), new_state))
    return False




[docs]
def beam_search_parsing(words, gramma, b=3):
    """bottom-up text parsing using beam search"""
    # init problem
    problem = TextParsingProblem(words, gramma, 'S')
    # init frontier
    frontier = [(len(problem.initial), problem.initial)]
    heapq.heapify(frontier)

    # explore the current frontier and keep b new states with lowest cost
    def explore(frontier):
        new_frontier = []
        for cost, state in frontier:
            # expand the possible children states of current state
            if not problem.goal_test(' '.join(state)):
                actions = problem.actions(state)
                for action in actions:
                    new_state = problem.result(state, action)
                    if [len(new_state), new_state] not in new_frontier and new_state != state:
                        new_frontier.append([len(new_state), new_state])
            else:
                return problem.goal
        heapq.heapify(new_frontier)
        # only keep b states
        return heapq.nsmallest(b, new_frontier)

    while frontier:
        frontier = explore(frontier)
        if frontier == problem.goal:
            return frontier
    return False



# ______________________________________________________________________________
# 22.4 Augmented Grammar


g = Grammar("arithmetic_expression",  # A Grammar of Arithmetic Expression
            rules={
                'Number_0': 'Digit_0', 'Number_1': 'Digit_1', 'Number_2': 'Digit_2',
                'Number_10': 'Number_1 Digit_0', 'Number_11': 'Number_1 Digit_1',
                'Number_100': 'Number_10 Digit_0',
                'Exp_5': ['Number_5', '( Exp_5 )', 'Exp_1, Operator_+ Exp_4', 'Exp_2, Operator_+ Exp_3',
                          'Exp_0, Operator_+ Exp_5', 'Exp_3, Operator_+ Exp_2', 'Exp_4, Operator_+ Exp_1',
                          'Exp_5, Operator_+ Exp_0', 'Exp_1, Operator_* Exp_5'],  # more possible combinations
                'Operator_+': operator.add, 'Operator_-': operator.sub, 'Operator_*': operator.mul,
                'Operator_/': operator.truediv,
                'Digit_0': 0, 'Digit_1': 1, 'Digit_2': 2, 'Digit_3': 3, 'Digit_4': 4
            },
            lexicon={})

g = Grammar("Ali loves Bob",  # A example grammar of Ali loves Bob example
            rules={
                "S_loves_ali_bob": "NP_ali, VP_x_loves_x_bob", "S_loves_bob_ali": "NP_bob, VP_x_loves_x_ali",
                "VP_x_loves_x_bob": "Verb_xy_loves_xy NP_bob", "VP_x_loves_x_ali": "Verb_xy_loves_xy NP_ali",
                "NP_bob": "Name_bob", "NP_ali": "Name_ali"
            },
            lexicon={
                "Name_ali": "Ali", "Name_bob": "Bob", "Verb_xy_loves_xy": "loves"
            })