# File: lipsum.py

#!/usr/bin/env python

import argparse
import marshal
import random
import regex

class Lipsum:
        """
        Markov-chain text generator working on lexems (words or single
        characters, each followed by its trailing whitespace).

        The model maps a prefix (tuple of `length` lexems, padded with None at
        the start of a sentence) to a list of (suffix, threshold) pairs, where
        thresholds are cumulative probabilities in (0, 1] and a None suffix is
        the "end of sentence" marker.

        Tokenization relies on the third-party `regex` module because it needs
        `Match.captures`, which the standard `re` module does not provide.
        """

        # Trailing whitespaces of a lexem (squashed to one space by clean_lexem)
        clean = regex.compile (r'\s+$')

        def __init__ (self, path, length = None):
                """
                Create a new lipsum instance by either training a model from given
                input text file, or loading a previously saved model.

                path:   text file to train from, or saved model file to load
                length: prefix length used for training; when None (the default),
                        `path` is assumed to be a model written by `save`

                Behavior depends on whether `length` is provided since it's only
                used when training a new model.
                """
                if length is None:
                        self._load (path)
                else:
                        self._train (path, length)

        def _load (self, path):
                """
                Load model and prefix length from a file written by `save`.
                """
                # NOTE: marshal is not secure against erroneous or maliciously
                # constructed data, only load model files from trusted sources.
                with open (path, 'rb') as stream:
                        (self.model, self.length) = marshal.load (stream)

        def _train (self, path, length):
                """
                Build the model from text file `path` using prefixes of `length`
                lexems.
                """
                self.length = length
                self.model = {}

                # A sentence is a run of lexems (every repetition of group 1 is
                # retrieved through "captures") and an optional dot or newline
                tokenize = regex.compile (r'((?:[-\w]+|[^\n.])\s*)*(?:[\n.]\s*)?')

                with open (path, 'r') as stream:
                        text = stream.read ()

                # Tokenize input string into (prefix -> (suffix -> count)) tree
                tree = {}

                for match in tokenize.finditer (text):
                        # Build a real list: it is measured and concatenated below,
                        # which an iterator (Python 3 "map") would not support
                        lexems = [self.clean_lexem (lexem) for lexem in match.captures (1)]

                        # Ignore empty sequences
                        if not lexems:
                                continue

                        # Register suffixes, including special "end of line" marker
                        prefix = (None, ) * length

                        for lexem in lexems + [None]:
                                suffixes = tree.setdefault (prefix, {})
                                suffixes[lexem] = suffixes.get (lexem, 0) + 1

                                prefix = self._shift (prefix, lexem)

                # Convert to (prefix -> [(suffix, threshold)]) model
                for (prefix, suffixes) in tree.items ():
                        occurrences = float (sum (suffixes.values ()))
                        thresholds = []
                        total = 0

                        # Accumulate integer counts and divide once per entry so the
                        # last threshold is exactly 1.0; summing float fractions
                        # could end slightly below 1.0 and let a random draw fall
                        # past every suffix
                        for (lexem, count) in suffixes.items ():
                                total += count

                                thresholds.append ((lexem, total / occurrences))

                        self.model[prefix] = thresholds

        @staticmethod
        def _shift (prefix, lexem):
                """
                Return `prefix` with its oldest lexem dropped and `lexem` appended,
                keeping the same number of elements (even for an empty prefix).
                """
                return (prefix + (lexem, ))[1:]

        def clean_lexem (self, lexem):
                """
                Cleanup input lexem by squashing all "dirty" characters (see the
                "clean" regular expression above) into a single space character,
                and convert it to lower case.
                """
                return self.clean.sub (' ', lexem).lower ()

        def first_above (self, suffixes, value):
                """
                Find first suffix whose threshold is strictly above given value
                (used for random suffix selection).

                suffixes: list of (suffix, threshold) pairs, thresholds ascending
                value:    number to compare thresholds against

                Returns the matching suffix (which may be the None end marker), or
                None when no threshold is above `value`.
                """
                for (lexem, threshold) in suffixes:
                        if threshold > value:
                                return lexem

                return None

        def generate (self):
                """
                Generate a random lexems sequence using currently loaded model and
                return it as a single string.
                """
                lexems = []
                prefix = (None, ) * self.length

                while prefix in self.model:
                        lexem = self.first_above (self.model[prefix], random.random ())

                        # None is the "end of line" marker registered during training
                        if lexem is None:
                                break

                        lexems.append (lexem)
                        prefix = self._shift (prefix, lexem)

                return ''.join (lexems)

        def save (self, path):
                """
                Save current model and prefix length to file, in the format read
                back when constructing an instance without `length`.
                """
                with open (path, 'wb') as stream:
                        marshal.dump ((self.model, self.length), stream)

# Command line interface: train a model (optionally saving it) or load a saved
# one, then print the requested number of generated lines.
parser = argparse.ArgumentParser (description = 'Lipsum blabla')
parser.add_argument ('-g', '--generate', type = int, default = 1, help = 'Generate N lines', metavar = 'N')
parser.add_argument ('-l', '--length', type = int, default = 3, help = 'Set prefix length', metavar = 'LEN')
parser.add_argument ('-m', '--model', action = 'store', help = 'Specify path to model', metavar = 'FILE')
parser.add_argument ('-t', '--train', action = 'store', help = 'Train from given file (and save if -m is specified)', metavar = 'FILE')

args = parser.parse_args ()

if args.train is not None:
        # Training takes precedence; -m then names the file the model is saved to
        lipsum = Lipsum (args.train, args.length)

        if args.model is not None:
                lipsum.save (args.model)

elif args.model is not None:
        # No training input: -m names a previously saved model to load
        lipsum = Lipsum (args.model)

else:
        # Report as a usage error (prints usage, exits with status 2) instead of
        # raising a bare exception with a traceback
        parser.error ('please specify either model or train argument')

for _ in range (args.generate):
        # Single-argument call form is valid on both Python 2 and Python 3
        print (lipsum.generate ())