Source code for markovchain.text.rank

import sys
import math
from abc import abstractmethod
from itertools import chain, islice

from ..util import SaveLoad
from ..text.util import get_words

class Rank(SaveLoad):
    """Base text rank class.

    Attributes
    ----------
    size : `int`
        Rank size. Validated to be positive, stored and saved; not
        otherwise used by the methods of this class.
    remove : `float`
        Fraction of the lowest-ranked strings dropped by `__call__`.
    debug : `bool`
        If True, enable debug output.
    """

    # NOTE(review): presumably a registry consumed by SaveLoad -- confirm
    # against ..util.SaveLoad.
    classes = {}

    def __init__(self, size=10, remove=0.5):
        """Rank constructor.

        Parameters
        ----------
        size : `int`, optional
            Rank size (default: 10).
        remove : `float`, optional
            Fraction of strings to remove when filtering (default: 0.5).

        Raises
        ------
        ValueError
            If `size` is not positive.
        """
        if size <= 0:
            raise ValueError('rank size <= 0')
        self.size = size
        self.remove = remove
        self.debug = False

    def __eq__(self, rank):
        """Compare with another rank.

        Two ranks are equal when the parent class considers them equal,
        their sizes match and their `remove` values differ by less
        than 1e-5.
        """
        # super() already binds self, so only the other operand is passed.
        parent_eq = super().__eq__(rank)
        if parent_eq is NotImplemented:
            # Let Python fall back to its default comparison handling
            # instead of treating NotImplemented as a truthy result.
            return NotImplemented
        return (parent_eq
                and self.size == rank.size
                and abs(self.remove - rank.remove) < 1e-5)

    def save(self):
        """Convert to a serializable mapping.

        Returns
        -------
        The mapping produced by the parent ``save()`` with the ``'size'``
        and ``'remove'`` entries added.
        """
        ret = super().save()
        ret['size'] = self.size
        ret['remove'] = self.remove
        return ret

    @abstractmethod
    def rank(self, string):
        """Rank a string.

        Parameters
        ----------
        string : `str`

        Returns
        -------
        `float`
        """
        pass

    def __call__(self, strings):
        """Filter strings by rank.

        Strings are ordered from highest to lowest rank and the lowest
        ``int(remove * count)`` of them are dropped; at least one string
        is kept whenever the input is not empty.

        Parameters
        ----------
        strings : `iterable` of `str`
            Strings to filter.

        Returns
        -------
        `list` of `str`
            Filtered list.
        """
        ranked = sorted(
            ((string, self.rank(string)) for string in strings),
            key=lambda pair: -pair[1]
        )
        # Never cut below one entry, even when `remove` would drop them all.
        end = max(1, len(ranked) - int(self.remove * len(ranked)))
        res = [string for string, _ in islice(ranked, 0, end)]
        if self.debug:
            print(res, file=sys.stderr)
        return res
class Const(Rank):
    """Constant text rank.

    Every string receives the same rank.
    """

    def __init__(self, **_):
        """Const rank constructor.

        Any keyword arguments are accepted and ignored; the rank is
        always created with size 1 and a removal fraction of 0.0.
        """
        super().__init__(size=1, remove=0.0)

    def rank(self, string):
        """Return the constant rank 1 regardless of `string`."""
        return 1
class Test(Rank):
    """Feature-based text rank.

    The rank of a string is the mean of several word-count features,
    each clamped to the range [0, 1].

    Attributes
    ----------
    header : `bool`
        True once the debug table header has been printed for the
        current `__call__`.
    opt_words : `int`
        Word count that maximises the first feature.
    opt_long_words : `int`
        Long word count that maximises the second feature.
    opt_long_word_ratio : `float`
        Long word / word ratio that maximises the third feature.
    long_word_length : `int`
        Minimum length for a word to count as long.
    """

    def __init__(self, size=10, remove=0.5):
        """Test rank constructor.

        Parameters
        ----------
        size : `int`, optional
            Rank size (default: 10).
        remove : `float`, optional
            Fraction of strings to remove when filtering (default: 0.5).
        """
        # Forward the arguments instead of silently replacing them with
        # hard-coded values.
        super().__init__(size=size, remove=remove)
        self.header = False
        self.opt_words = 8
        self.opt_long_words = 4
        self.opt_long_word_ratio = 0.6
        self.long_word_length = 4

    def features(self, string):
        """Compute the raw (unclamped) features of a string.

        Parameters
        ----------
        string : `str`

        Returns
        -------
        `list` of `float`
            Closeness of the word count to `opt_words`, closeness of the
            long word count to `opt_long_words` (squared penalty), and
            closeness of the long word ratio to `opt_long_word_ratio`.
        """
        words = get_words(string)
        nwords = len(words)
        nlongwords = sum(
            1 for word in words
            if len(word) >= self.long_word_length
        )
        # A string without words has no long words either; use a ratio
        # of 0 rather than dividing by zero.
        long_ratio = nlongwords / nwords if nwords else 0.0
        return [
            1 - abs(1 - nwords / self.opt_words),
            1 - abs(1 - nlongwords / self.opt_long_words) ** 2,
            1 - abs(1 - long_ratio / self.opt_long_word_ratio)
        ]

    def log(self, res, features, string):
        """Print one debug row to stderr, preceded by a header on first use.

        Parameters
        ----------
        res : `float`
            Final rank of the string.
        features : `list` of `float`
            Feature values the rank was computed from.
        string : `str`
            Ranked string.
        """
        if not self.header:
            self.header = True
            fmt = ' Rank ' + ' %02d ' * len(features)
            print(fmt % tuple(range(len(features))), file=sys.stderr)
        fmt = '%.04f ' * (len(features) + 1) + '%s'
        print(fmt % tuple(chain((res,), features, (string,))),
              file=sys.stderr)

    def rank(self, string):
        """Rank a string.

        Parameters
        ----------
        string : `str`

        Returns
        -------
        `float`
            Mean of the features, each clamped to [0, 1].
        """
        features = [min(1, max(0, x)) for x in self.features(string)]
        ret = sum(features) / len(features)
        if self.debug:
            self.log(ret, features, string)
        return ret

    def __call__(self, strings):
        """Filter strings by rank.

        Resets the debug header flag so that a new header is printed for
        this batch, then delegates to the parent implementation.

        Parameters
        ----------
        strings : `iterable` of `str`
            Strings to filter.

        Returns
        -------
        `list` of `str`
            Filtered list.
        """
        self.header = False
        return super().__call__(strings)