001: #!/usr/bin/env python
002:
003: import argparse
004: import marshal
005: import random
006: import regex
007:
class Lipsum:
    """
    Random text generator backed by a prefix -> suffix probability model.

    The model maps a tuple of `length` consecutive lexems (the prefix) to a
    list of (suffix, cumulative probability) pairs. The prefix made only of
    None values marks the start of a sequence and the None suffix marks its
    end.
    """

    # Matches trailing whitespace of a lexem (squashed into a single space).
    clean = regex.compile(r'\s+$')

    def __init__(self, path, length=None):
        """
        Create a new lipsum instance by either training a model from given
        input text file, or loading a previously saved model.

        path   -- text file to train from, or model file written by save()
        length -- prefix length used for training; when None, `path` is
                  loaded as a previously saved model instead
        """
        # If length is None we assume input is a trained model
        if length is None:
            # NOTE: marshal is not safe against erroneous or malicious data
            # and its format may change between Python versions; only load
            # model files produced by a trusted save() call.
            with open(path, 'rb') as file:
                (self.model, self.length) = marshal.load(file)

            return

        # Otherwise start training
        self.length = length
        self.model = {}

        # Tokenize input string into (prefix -> (suffix, count)) tree; each
        # match is one sequence and captures of group 1 are its lexems
        tokenize = regex.compile(r'((?:[-\w]+|[^\n.])\s*)*(?:[\n.]\s*)?')
        tree = {}

        with open(path, 'r') as file:
            for match in tokenize.finditer(file.read()):
                lexems = [self.clean_lexem(lexem) for lexem in match.captures(1)]

                # Ignore empty sequences
                if not lexems:
                    continue

                # Register suffixes, including special "end of line" marker
                prefix = (None,) * length

                for lexem in lexems + [None]:
                    suffixes = tree.setdefault(prefix, {})
                    suffixes[lexem] = suffixes.get(lexem, 0) + 1

                    # Slide the window; slicing after appending keeps the
                    # prefix at a constant size even when length is 0
                    prefix = (prefix + (lexem,))[1:]

        # Convert to (prefix -> (suffix, cumulative probability)) model
        for key, suffixes in tree.items():
            occurrences = float(sum(suffixes.values()))
            thresholds = []
            total = 0

            for lexem, count in suffixes.items():
                total += count / occurrences

                thresholds.append((lexem, total))

            # Rounding errors may leave the last cumulative value slightly
            # below 1; pin it so every random value in [0, 1) selects a suffix
            thresholds[-1] = (thresholds[-1][0], 1.0)

            self.model[key] = thresholds

    def clean_lexem(self, lexem):
        """
        Cleanup input lexem by squashing all "dirty" characters (see the
        "clean" regular expression above) into a single space character,
        and convert the result to lower case.
        """
        return self.clean.sub(' ', lexem).lower()

    def first_above(self, suffixes, value):
        """
        Find first suffix whose threshold is strictly above given value (used
        for random suffix selection). Return None when there is no such
        suffix; None is also the value of the "end of sequence" suffix.
        """
        for lexem, threshold in suffixes:
            if threshold > value:
                return lexem

        return None

    def generate(self):
        """
        Generate a random lexems sequence using currently loaded model and
        return it as a single string.
        """
        lexems = []
        prefix = (None,) * self.length

        while prefix in self.model:
            lexem = self.first_above(self.model[prefix], random.random())

            # End of sequence marker (or no selectable suffix)
            if lexem is None:
                break

            lexems.append(lexem)
            prefix = (prefix + (lexem,))[1:]

        return ''.join(lexems)

    def save(self, path):
        """
        Save current model and its prefix length to file.
        """
        with open(path, 'wb') as file:
            marshal.dump((self.model, self.length), file)
106:
# Command line entry point: train a model from a text file (optionally saving
# it) or load a previously saved one, then print generated lines.
parser = argparse.ArgumentParser(description='Lipsum blabla')
parser.add_argument('-g', '--generate', type=int, default=1, help='Generate N lines', metavar='N')
parser.add_argument('-l', '--length', type=int, default=3, help='Set prefix length', metavar='LEN')
parser.add_argument('-m', '--model', action='store', help='Specify path to model', metavar='FILE')
parser.add_argument('-t', '--train', action='store', help='Train from given file (and save if -m is specified)', metavar='FILE')

# Kept as a module-level name on purpose: other code in this file may read it.
args = parser.parse_args()

if args.train is not None:
    # Train a new model, and persist it when a model path was given
    lipsum = Lipsum(args.train, args.length)

    if args.model is not None:
        lipsum.save(args.model)

elif args.model is not None:
    # Load a previously saved model
    lipsum = Lipsum(args.model)

else:
    # Report through argparse so the user gets the usage message and a
    # non-zero exit status instead of a traceback
    parser.error('please specify either model or train argument')

for _ in range(args.generate):
    # Single-argument call form prints the same on Python 2 and Python 3
    print(lipsum.generate())