from itertools import count # used to create a generator to generate an id for each state.
from collections import OrderedDict
import graphviz

'''
Create a class MinDict to implement the Daciuk-Algorithm. 
'''
class MinDict():
    def __init__(self, words):
        self.final_states : set[int] = set()  # a set used to store all final states
        self.curr_id = count()  # create an id generator: next_id = next(curr_id)
        self.start = next(self.curr_id)  # the start state
        self.register : set[int] = set()  # a set used to store all registered states, which are inequal to each other
        self.tr : dict[int, OrderedDict[str, int]] = {self.start:OrderedDict()}  # a dictionary used to store all states and their transitions
        self.compile(words)  # compile all functions to build the final minimal dictionary.
        
        self.language = list() # init a list used to store the words acquired via traversal
    
    # This methode is used to calculate the common prefix of two given words.
    def common_prefix(self, word, pre_word):
        """
        @param word: currently read-in word
        @parm pre_word: previously read-in word
        return: the common prefix of both words
        """
        common_prefix = ''  # init the common prefix
        
        # TODO: Return the common prefix of two words.
#         pass
        for i, c in enumerate(word):
            if i < len(pre_word):
                if c == pre_word[i]:
                    common_prefix += c
                else:
                    return common_prefix
            else:
                return common_prefix
                
            
    
    def split_state(self, common_prefix):
        """
        @param common_predfix: a substring, the common prefix
        return: the split state
        """
        split_state = self.start
        if common_prefix == '':
            return split_state
        else:
            for i, c in enumerate(common_prefix):
                split_state = self.tr[split_state][c]
            return split_state
    
    def has_children(self, state):
        """
        @param: state id
        return: boolean value, if the given state has children or not
        """
        # TODO: Return True if the state has children, otherwise False.
        # Hint: That self.has_children(state) is true means self.tr[state] is not an empty OrderedDict.
        return self.tr[state]

    
    def is_equal(self, s1, s2):
        """
        @param: two state ids
        return: boolean value, if the given states are equal
        """
        # TODO: If two states s1 and s2 are equal, return True, otherwise False.
        # Hint: Two conditions need to be satisfied if s1 and s2 are equal:
            # a. Both s1 and s2 are final or non-final states.
            # b. The transitions of s1 and s2 are the same.
        return (s1 in self.final_states) == (s2 in self.final_states) \
                and self.tr[s1] == self.tr[s2]
    
    def replace_or_register(self, state):
        """
        @param: a state
        """
        last_child = self.tr[state][next(reversed(self.tr[state]))]  # get the last child state from the state transition.
        if self.has_children(last_child):
            self.replace_or_register(last_child)
        # replace and delete the last child state if it has an equal state in the set of the registered states.
        for s in self.register:
            if self.is_equal(last_child, s):
                # TODO: Replace the last child with the found equal state in th registered set.
                self.tr[state][next(reversed(self.tr[state]))] = s
            
                del self.tr[last_child] # delete the last child state
                return
        # add the last child into the register set if no equal states are found
        self.register.add(last_child)
    
    '''
    This function is used to add the current suffix after the split state
    @param suffix: a substring 
    @param state: state id
    '''
    def add_suffix(self, suffix, state):
        for c in suffix:
            # TODO: Use the count generator self.curr_id to create the id of next state. (Replace None with your code.)
            next_state = next(self.curr_id)
            if state not in self.tr:
                self.tr[state] = OrderedDict() 
            self.tr[state][c] = next_state
            state = next_state
        
        self.tr[state] = OrderedDict()
        # TODO: Add the last state to the set of final states. 
        self.final_states.add(state)
    
    def compile(self, words):
        """
        @param: a sorted list of words
        gather all parts together to implement the Daciuk-Algorithm.
        """
        pre_word = ''
        for word in words:
            common_prefix = self.common_prefix(word, pre_word)
            split_state = self.split_state(common_prefix)
            current_suffix = word[len(common_prefix):]
            
            if self.has_children(split_state):
                self.replace_or_register(split_state)
            self.add_suffix(current_suffix, split_state)
            pre_word = word
        
        self.replace_or_register(self.start)
        self.register.add(self.start)

# define a function with the help of graphviz to draw an automoton.
def draw_automaton(states, final_states, initial_state, delta):
    
    g = graphviz.Digraph('MinDict')
    
    for state in states:
        if state in final_states:
            g.attr('node',style='bold')
        if state == initial_state:
            g.node(str(state),labels = '->' + str(state))
        else:
            g.node(str(state))
        g.attr('node',style='solid')
    
    for x, label, z in delta:
        g.edge(str(x),str(z),label=' '+label+' ')
        
    return g


def main():
    dictionary = ["acker", "alle", "alraune", "as", "aspekt"]
    min_dict = MinDict(dictionary)  # Instantiate a minimal dictionary.
    
    # retrieve the elements from min_dict to formalize a 5-tuple for an automoton.
    start = str(0)  # start state
    states = {str(i) for i in min_dict.register}  # set of all states
    final_states = {str(i) for i in min_dict.final_states if i in min_dict.register}  # set of final states
    delta = set()  # transition function
    for p in min_dict.tr:
        if p in min_dict.register:
            for c in min_dict.tr[p]:
                q = min_dict.tr[p][c]
                delta.add((str(p), c, str(q)))
    
    # draw the minimal dictionary
    g = draw_automaton(states, final_states, start, delta)
    g.render('aut1.gv', view=True)

if __name__ == '__main__':
    main()


from collections import OrderedDict
import graphviz

class State():
    def __init__(self):
        self.is_final = False  # set non-final state as default
        self.tr = OrderedDict()  # initialize state transitions with empty OrderedDict
        self.id = None
    
    # overload the operator "=="
    def __eq__(self, other):
        return self.is_final is other.is_final and self.tr == other.tr
    
    # set attribute last child
    @property
    def last_child(self):
        return self.tr[next(reversed(self.tr))]
    
    @last_child.setter
    def last_child(self, new_child):
        self.tr[next(reversed(self.tr))] = new_child
        

class MinDict():
    def __init__(self, words):
        self.start = State()  
        self.register = list()  # create a list to store registered states instead of set, because list can store the read-in sequence as well.
        self.compile(words)  # compile all functions to build the final minimal dictionary.
        self.set_id()  # assign each state an id

        self.language = list() # init a list used to store the words acquired via traversal
    
    # This methode is used to calculate the common prefix of two given words.
    def common_prefix(self, word, pre_word):
        common_prefix = ''
        for i, c in enumerate(word):
            if i < len(pre_word):
                if c ==  pre_word[i]:
                    common_prefix += c
                else:
                    return common_prefix
            else:
                return common_prefix
    
    def split_state(self, common_prefix):
        split_state = self.start
        if common_prefix == '':
            return split_state
        else:
            for c in common_prefix:
                split_state = split_state.tr[c]  # TODO: Update the split state. (Replace None with your own code)
            return split_state
    
    def has_children(self, state):
        return state.tr
    
    def register_or_replace(self, state):
        child = state.last_child
        if self.has_children(child):
            self.register_or_replace(child)
        
        for s in self.register:
            if child == s:
                state.last_child = s
                del child
                return
        
        self.register.append(child)
    
    def add_suffix(self, suffix, state):
        for c in suffix:
            next_state = State() #TODO: Create the next state by instantiating a State() class object. (Replace None with your code)
            state.tr[c] = next_state
            state = next_state
#         pass #TODO: Set the last state as final state.
        state.is_final = True
        
    
    def compile(self, words):
        pre_word = ''
        for word in words:
            common_prefix = self.common_prefix(word, pre_word)
            split_state = self.split_state(common_prefix)
            current_suffix = word[len(common_prefix):]
            if self.has_children(split_state):
                self.register_or_replace(split_state)
            self.add_suffix(current_suffix, split_state)
            pre_word = word
        
        self.register_or_replace(self.start)
        self.register.append(self.start)
    
    def set_id(self):
        for index, s in enumerate(self.register):
            s.id = index

    # Exercise 06: Traverse the automaton
    # traverse the automaton to get the language    
    def get_language(self, state=None, transition=str()):
        if state is None:
            state = self.start
        
        for c, next_state in state.tr.items():
            new_transition = transition + c
            if next_state.is_final:
                self.language.append(new_transition) # yield
            self.get_language(next_state, new_transition)    
    # all words are stored in self.language
        
# define a function with the help of graphviz to draw an automoton.
def draw_automaton(states, final_states, initial_state, delta):
    
    g = graphviz.Digraph('MinDict')
    
    for state in states:
        if state in final_states:
            g.attr('node',style='bold')
        if state == initial_state:
            g.node(str(state),labels = '->' + str(state))
        else:
            g.node(str(state))
        g.attr('node',style='solid')
    
    for x, label, z in delta:
        g.edge(str(x),str(z),label=' '+label+' ')
        
    return g

def main():
    dictionary = ["acker", "alle", "alraune", "as", "aspekt"]
    min_dict = MinDict(dictionary)
    
    min_dict.get_language()
    print(min_dict.language)
    
    # retrieve the elements from min_dict to formalize a 5-tuple for an automoton.
#     final_states = set()
#     states = set()
#     delta = set()
#     start_state = str(min_dict.start.id)
    
#     s_index = enumerate(min_dict.register)
    
#     for state in min_dict.register:
#         index = str(state.id)
#         states.add(index)
#         if state.is_final:
#             final_states.add(index)
#         for c in state.tr:
#             out_state_index = str(state.tr[c].id)
#             delta.add((index, c, out_state_index))
    
#     # draw the minimal dictionary
#     g = draw_automaton(states, final_states, start_state, delta)
#     g.render('graphviz/aut_oop3.gv', view=True)

if __name__ == '__main__':
    main()

['acker', 'alle', 'alraune', 'as', 'aspekt']

Übung Basismodul Computerlinguistik¶

Hausaufgabe 05 Implementierung des Daciuk-Algorithmus¶

Möglichkeit 1 - Zustand durch Integer repränsentiert¶

Möglichkeit 2 - Erstelle eine Klasse für Zustand (OOP)¶