#!/usr/bin/env python3

import argparse
import marshal
import random
import regex
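# The third-party `regex` module is used instead of the standard `re` module because
# the tokenizer below relies on match.captures (), which `re` does not provide.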

class Lipsum:
        clean = regex.compile ('\\s+$')

        """
        Create a new Lipsum instance by either training a model from a given input
        text file or loading a previously saved model. The mode is selected by the
        optional `length` parameter: when it is provided the input is treated as
        training text, otherwise it is assumed to be a previously saved model.
        """
        def __init__ (self, path, length = None):
                # If length is None, assume the input is an already trained model
                if length is None:
                        with open (path, 'rb') as file:
                                (self.model, self.length) = marshal.load (file)

                # Otherwise start training
                else:
                        self.length = length
                        self.model = {}
                        # Tokenize input string into a (prefix -> (suffix, count)) tree
                        tokenize = regex.compile ('((?:[-\\w]+|[^\\n.])\\s*)*(?:[\\n.]\\s*)?')
                        tree = {}

                        with open (path, 'r') as file:
                                for match in tokenize.finditer (file.read ()):
                                        lexems = list (map (self.clean_lexem, match.captures (1)))

                                        # Ignore empty sequences
                                        if len (lexems) < 1:
                                                continue

                                        # Register suffixes, including the special "end of sequence" marker
                                        prefix = (None, ) * length

                                        for lexem in lexems + [None]:
                                                suffixes = tree.setdefault (prefix, {})
                                                suffixes[lexem] = suffixes.get (lexem, 0) + 1

                                                prefix = prefix[1:] + (lexem, )

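                        # Illustration only (hypothetical input): the sentence "Lorem ipsum dolor."
                        # produces the captures ['Lorem ', 'ipsum ', 'dolor'], which clean_lexem turns
                        # into the lexems ['lorem ', 'ipsum ', 'dolor'] counted in the tree above.
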
                        # Convert to (prefix -> (suffix, cumulative probability)) model
                        for (key, suffixes) in tree.items ():
                                occurrences = float (sum (suffixes.values ()))
                                thresholds = []
                                total = 0

                                for (lexem, count) in suffixes.items ():
                                        total += count / occurrences

                                        thresholds.append ((lexem, total))

                                self.model[key] = thresholds
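
                        # Illustration only (hypothetical counts): if the prefix ('lorem ', 'ipsum ', 'dolor ')
                        # was followed by 'sit ' three times and by the end-of-sequence marker once, its entry
                        # becomes ('lorem ', 'ipsum ', 'dolor ') -> [('sit ', 0.75), (None, 1.0)].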

        """
        Clean up an input lexem by squashing any trailing whitespace (see the "clean"
        regular expression above) into a single space character and lowercasing it.
        """
        def clean_lexem (self, lexem):
                return self.clean.sub (' ', lexem).lower ()
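                # For example (illustration only): clean_lexem ('Lorem   \n') would return 'lorem '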

        """
        Find the first suffix whose cumulative probability threshold is above the
        given value (used for weighted random suffix selection). This linear scan
        could be replaced by a binary search (see the sketch after this method).
        """
        def first_above (self, suffixes, value):
                i = 0

                while i < len (suffixes) and suffixes[i][1] <= value:
                        i += 1

                return suffixes[i][0] if i < len (suffixes) else None
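
        # A possible binary-search replacement for the linear scan above, assuming
        # Python 3.10+ (where bisect gained its `key` argument); sketch only:
        #
        #         def first_above (self, suffixes, value):
        #                 i = bisect.bisect_right (suffixes, value, key = lambda suffix: suffix[1])
        #
        #                 return suffixes[i][0] if i < len (suffixes) else None
        #
        # (this would also require `import bisect` at the top of the file)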

        """
        Generate a random sequence of lexems using the currently loaded model.
        """
        def generate (self):
                buffer = ''
                prefix = (None, ) * self.length

                while prefix in self.model:
                        lexem = self.first_above (self.model[prefix], random.random ())

                        if lexem is None:
                                break

                        buffer += lexem
                        prefix = prefix[1:] + (lexem, )

                return buffer

        """
        Save the current model to a file.
        """
        def save (self, path):
                with open (path, 'wb') as file:
                        marshal.dump ((self.model, self.length), file)

parser = argparse.ArgumentParser (description = 'Generate pseudo-random filler text from a word-level Markov chain model')
parser.add_argument ('-g', '--generate', type = int, default = 1, help = 'Generate N lines', metavar = 'N')
parser.add_argument ('-l', '--length', type = int, default = 3, help = 'Set prefix length', metavar = 'LEN')
parser.add_argument ('-m', '--model', action = 'store', help = 'Specify path to model', metavar = 'FILE')
parser.add_argument ('-t', '--train', action = 'store', help = 'Train from given file (and save if -m is specified)', metavar = 'FILE')

args = parser.parse_args ()

if args.train is not None:
        lipsum = Lipsum (args.train, args.length)

        if args.model is not None:
                lipsum.save (args.model)

elif args.model is not None:
        lipsum = Lipsum (args.model)

else:
        parser.error ('please specify either --train or --model')

for i in range (args.generate):
        print (lipsum.generate ())
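
# Example usage (file names are illustrative):
#
#         python lipsum.py --train corpus.txt --length 3 --model model.bin
#         python lipsum.py --model model.bin --generate 5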