import re
from abc import abstractmethod
from .util import CharCase
from ..scanner import Scanner
from ..util import int_enum
class TextScanner(Scanner):
    """Text scanner base class.

    Converts the character case of the input and delegates tokenization
    to `scan`, which subclasses must implement.

    Attributes
    ----------
    case : `markovchain.text.util.CharCase`
        Character case.
    """

    def __init__(self, case=CharCase.LOWER):
        """Text scanner constructor.

        Parameters
        ----------
        case : `str` or `int` or `markovchain.text.util.CharCase`, optional
            Character case (default: markovchain.text.util.CharCase.LOWER).
        """
        super().__init__()
        self.case = int_enum(CharCase, case)

    def __eq__(self, scanner):
        """Compare scanner settings.

        Parameters
        ----------
        scanner : `object`
            Object to compare with.

        Returns
        -------
        `bool`
            `True` if `scanner` is a text scanner with the same character
            case, `False` otherwise.
        """
        # Return a plain False (not NotImplemented) for foreign objects:
        # subclasses combine this result with `and`, which needs a real bool.
        if not isinstance(scanner, TextScanner):
            return False
        return self.case == scanner.case

    def __call__(self, data, part=False):
        """Scan a string.

        Parameters
        ----------
        data : `str`
            String to scan.
        part : `bool`, optional
            True if data is partial (default: `False`).

        Returns
        -------
        `generator` of (`str` or `markovchain.scanner.Scanner.END`)
            Token generator.
        """
        # Case conversion happens once here, so `scan` implementations
        # always receive already-converted text.
        data = self.case.convert(data)
        return self.scan(data, part)

    def save(self):
        """Convert to JSON.

        Returns
        -------
        `dict`
            Data returned by the parent class, extended with the 'case' key.
        """
        data = super().save()
        data['case'] = self.case
        return data

    @abstractmethod
    def scan(self, data, part):
        """Scan a string.

        Parameters
        ----------
        data : `str`
            String to scan.
        part : `bool`
            True if data is partial.

        Returns
        -------
        `generator` of (`str` or `markovchain.scanner.Scanner.END`)
            Token generator.
        """
        pass
class CharScanner(TextScanner):
    """Character scanner.

    Yields the input one character at a time and marks sentence
    boundaries with `Scanner.END`. The scanner keeps state between
    calls, so a sentence may span several partial scans.

    Attributes
    ----------
    case : `markovchain.text.util.CharCase`
        Character case.
    end_chars : `str`
        Sentence ending characters.
    default_end : `str`
        Default sentence ending character.
    start : `bool`
        True if current sentence is started.
    end : `bool`
        True if current sentence is ended.

    Examples
    --------
    >>> scan = CharScanner()
    >>> list(scan('word'))
    ['w', 'o', 'r', 'd', '.', Scanner.END]
    >>> list(scan('word', True))
    ['w', 'o', 'r', 'd']
    >>> list(scan(''))  # finishes the sentence left open by the partial scan
    ['.', Scanner.END]
    """

    def __init__(self, end_chars='.?!', default_end='.', case=CharCase.LOWER):
        """Character scanner constructor.

        Parameters
        ----------
        end_chars : `str`, optional
            Sentence ending characters (default: '.?!').
        default_end : `str`, optional
            Default sentence ending character (default: '.').
        case : `str` or `int` or `markovchain.text.util.CharCase`, optional
            Character case (default: markovchain.text.util.CharCase.LOWER).
        """
        super().__init__(case)
        self.end_chars = end_chars
        self.default_end = default_end
        self.start = False
        self.end = False

    def __eq__(self, scanner):
        """Compare scanner settings (the scanning state is ignored).

        Parameters
        ----------
        scanner : `object`
            Object to compare with.

        Returns
        -------
        `bool`
            `True` if `scanner` is a character scanner with the same case,
            ending characters and default ending character.
        """
        # Other scanner types have no `end_chars`; treat them as unequal
        # instead of failing with AttributeError.
        if not isinstance(scanner, CharScanner):
            return False
        return (super().__eq__(scanner)
                and self.end_chars == scanner.end_chars
                and self.default_end == scanner.default_end)

    def reset(self):
        """Reset scanner state.
        """
        self.start = False
        self.end = False

    def scan(self, data, part):
        """Scan a string.

        Parameters
        ----------
        data : `str`
            String to scan.
        part : `bool`
            True if data is partial.

        Returns
        -------
        `generator` of (`str` or `markovchain.scanner.Scanner.END`)
            Token generator.
        """
        if not self.end_chars:
            # No ending characters configured: every character is a token
            # and the sentence can only be closed by a non-partial scan.
            yield from data
            self.start = self.start or bool(data)
            self.end = False
        else:
            for char in data:
                if char in self.end_chars:
                    if not self.start:
                        # Drop ending characters that precede any sentence.
                        continue
                    self.end = True
                else:
                    if self.end:
                        # First ordinary character after the ending
                        # characters: close the previous sentence.
                        yield self.END
                        self.end = False
                    self.start = True
                yield char
        if not part and self.start:
            # Input is complete: terminate the sentence that is still open,
            # adding the default ending character if it has none.
            if not self.end and self.default_end is not None:
                yield self.default_end
            yield self.END
            self.reset()

    def save(self):
        """Convert to JSON.

        Returns
        -------
        `dict`
            JSON data.
        """
        data = super().save()
        data['end_chars'] = self.end_chars
        data['default_end'] = self.default_end
        return data
class RegExpScanner(TextScanner):
    """Regular expression scanner.

    Yields the matches of a regular expression as tokens and marks
    sentence boundaries with `Scanner.END`. The scanner keeps state
    between calls, so a sentence may span several partial scans.

    Attributes
    ----------
    DEFAULT_EXPR : `re.Pattern`
        Default regular expression.
    case : `markovchain.text.util.CharCase`
        Character case.
    expr : `re.Pattern`
        Regular expression.
    default_end : `str`
        Default sentence ending string.
    end : `bool`
        `True` if current sentence is ended.

    Examples
    --------
    >>> scan = RegExpScanner()
    >>> list(scan('word word. word'))
    ['word', 'word', '.', Scanner.END, 'word', '.', Scanner.END]
    >>> list(scan('word', True))
    ['word']
    >>> list(scan(''))  # finishes the sentence left open by the partial scan
    ['.', Scanner.END]
    """

    DEFAULT_EXPR = re.compile(
        r'(?:(?P<end>[.!?]+)|(?P<word>(?:[^\w\s]+|\w+)))'
    )

    def __init__(self, expr=DEFAULT_EXPR, default_end='.', case=CharCase.LOWER):
        """Regular expression scanner constructor.

        Parameters
        ----------
        expr : `str` or `re.Pattern`, optional
            Regular expression (default: `RegExpScanner.DEFAULT_EXPR`).
            It should have groups 'end' (sentence ending punctuation)
            and 'word' (words / other punctuation).
        default_end : `str`, optional
            Default sentence ending string (default: '.').
        case : `str` or `int` or `markovchain.text.util.CharCase`, optional
            Character case (default: markovchain.text.util.CharCase.LOWER).
        """
        super().__init__(case)
        self.expr = self.get_regexp(expr)
        self.default_end = default_end
        self.end = True

    def __eq__(self, scanner):
        """Compare scanner settings (the scanning state is ignored).

        Parameters
        ----------
        scanner : `object`
            Object to compare with.

        Returns
        -------
        `bool`
            `True` if `scanner` is a regular expression scanner with the
            same case, expression and default ending string.
        """
        # Other scanner types have no `expr`; treat them as unequal
        # instead of failing with AttributeError.
        if not isinstance(scanner, RegExpScanner):
            return False
        return (super().__eq__(scanner)
                and self.expr == scanner.expr
                and self.default_end == scanner.default_end)

    def reset(self):
        """Reset scanner state.
        """
        self.end = True

    def scan(self, data, part):
        """Scan a string.

        Parameters
        ----------
        data : `str`
            String to scan.
        part : `bool`
            `True` if data is partial.

        Returns
        -------
        `generator` of (`str` or `markovchain.scanner.Scanner.END`)
            Token generator.
        """
        if not self.expr.groups:
            # Pattern without groups: every match is a plain token and the
            # sentence can only be closed by a non-partial scan.
            for match in self.expr.finditer(data):
                # Open the sentence only when a token is really produced,
                # so input without any match does not emit an empty sentence.
                self.end = False
                yield match.group()
        else:
            for match in self.expr.finditer(data):
                group = self.get_group(match, 'end')
                if group is not None:
                    # Sentence ending punctuation; ignored when no sentence
                    # is currently open.
                    if not self.end:
                        yield group
                        yield self.END
                        self.end = True
                else:
                    self.end = False
                    group = self.get_group(match, 'word')
                    if group is not None:
                        yield group
                    else:
                        # Neither named group matched: use the whole match.
                        yield match.group()
        if not part and not self.end:
            # Input is complete: terminate the sentence that is still open.
            if self.default_end is not None:
                yield self.default_end
            yield self.END
            self.reset()

    def save(self):
        """Convert the scanner to JSON.

        Returns
        -------
        `dict`
            JSON data.
        """
        data = super().save()
        data['expr'] = self.expr.pattern
        data['default_end'] = self.default_end
        return data

    @staticmethod
    def get_regexp(x):
        """Compile a regular expression if necessary.

        Parameters
        ----------
        x : `str` or `re.Pattern`
            Regular expression.

        Returns
        -------
        `re.Pattern`
            Compiled regular expression; anything that is not a `str`
            is returned unchanged.
        """
        if isinstance(x, str):
            return re.compile(x)
        return x

    @staticmethod
    def get_group(match, group):
        """Get a group from a regular expression match object if it exists.

        Parameters
        ----------
        match : `re.Match`
            Regular expression match object.
        group : `str` or `int`
            Group name or index.

        Returns
        -------
        `str` or `None`
            Group contents, or `None` if the pattern has no such group or
            the group did not take part in the match.
        """
        try:
            return match.group(group)
        except IndexError:
            # Raised by `Match.group` when the pattern defines no such group.
            return None