#!/usr/bin/env python

import argparse
import marshal
import random

import regex


class Lipsum:
    """Markov chain text generator used to produce lorem-ipsum-like filler text."""

    # Matches trailing whitespace so it can be squashed into a single space
    clean = regex.compile(r'\s+$')

    def __init__(self, path, length=None):
        """
        Create a new Lipsum instance by either training a model from a given
        input text file, or loading a previously saved model. Behavior depends
        on whether the length parameter is provided, since it is only used when
        training a new model (admittedly an ugly hack).
        """

        # If length is None, assume the input file is a trained model
        if length is None:
            with open(path, 'rb') as file:
                (self.model, self.length) = marshal.load(file)

        # Otherwise start training
        else:
            self.length = length
            self.model = {}

            # Tokenize input string into a (prefix -> {suffix: count}) tree
            tokenize = regex.compile(r'((?:[-\w]+|[^\n.])\s*)*(?:[\n.]\s*)?')
            tree = {}

            with open(path, 'r') as file:
                for match in tokenize.finditer(file.read()):
                    lexems = [self.clean_lexem(lexem) for lexem in match.captures(1)]

                    # Ignore empty sequences
                    if len(lexems) < 1:
                        continue

                    # Register suffixes, including a special "end of line" marker
                    prefix = (None,) * length

                    for lexem in lexems + [None]:
                        suffixes = tree.setdefault(prefix, {})
                        suffixes[lexem] = suffixes.get(lexem, 0) + 1
                        prefix = prefix[1:] + (lexem,)

            # Convert to a (prefix -> [(suffix, cumulative probability)]) model
            for (key, suffixes) in tree.items():
                occurrences = sum(suffixes.values())
                thresholds = []
                total = 0

                for (lexem, count) in suffixes.items():
                    total += count / occurrences
                    thresholds.append((lexem, total))

                self.model[key] = thresholds

    def clean_lexem(self, lexem):
        """
        Clean up an input lexeme by squashing all "dirty" characters (see the
        "clean" regular expression above) into a single space character.
        """

        return self.clean.sub(' ', lexem).lower()

    def first_above(self, suffixes, value):
        """
        Find the first suffix whose cumulative probability is above the given
        value (used for random suffix selection). This linear scan could be
        replaced by a binary search, e.g. using the standard bisect module.
        """

        i = 0

        while i < len(suffixes) and suffixes[i][1] <= value:
            i += 1

        return suffixes[i][0] if i < len(suffixes) else None

    def generate(self):
        """Generate a random lexeme sequence using the currently loaded model."""

        buffer = ''
        prefix = (None,) * self.length

        while prefix in self.model:
            lexem = self.first_above(self.model[prefix], random.random())

            if lexem is None:
                break

            buffer += lexem
            prefix = prefix[1:] + (lexem,)

        return buffer

    def save(self, path):
        """Save the current model to a file."""

        with open(path, 'wb') as file:
            marshal.dump((self.model, self.length), file)


parser = argparse.ArgumentParser(description='Markov chain lipsum text generator')
parser.add_argument('-g', '--generate', type=int, default=1, help='Generate N lines', metavar='N')
parser.add_argument('-l', '--length', type=int, default=3, help='Set prefix length', metavar='LEN')
parser.add_argument('-m', '--model', action='store', help='Specify path to model', metavar='FILE')
parser.add_argument('-t', '--train', action='store', help='Train from given file (and save if -m is specified)', metavar='FILE')
args = parser.parse_args()

if args.train is not None:
    lipsum = Lipsum(args.train, args.length)

    if args.model is not None:
        lipsum.save(args.model)
elif args.model is not None:
    lipsum = Lipsum(args.model)
else:
    parser.error('please specify either --train or --model')

for _ in range(args.generate):
    print(lipsum.generate())
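
# Example usage (a minimal sketch; "lipsum.py", "corpus.txt" and "corpus.model"
# are hypothetical file names, not defined by this script):
#
#   # Train a length-3 model from a plain-text corpus and save it:
#   python lipsum.py --train corpus.txt --length 3 --model corpus.model
#
#   # Reload the saved model and generate five lines of filler text:
#   python lipsum.py --model corpus.model --generate 5
#
# As noted in first_above, the linear scan could be swapped for a binary search
# over the cumulative thresholds; a possible sketch (assuming Python 3.10+,
# where the bisect functions accept a key argument):
#
#   import bisect
#
#   def first_above(self, suffixes, value):
#       i = bisect.bisect_right(suffixes, value, key=lambda suffix: suffix[1])
#       return suffixes[i][0] if i < len(suffixes) else None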