Source code for rhoknp.processors.senter
import logging
import re
import threading
from typing import ClassVar
try:
from typing import override # type: ignore[attr-defined]
except ImportError:
from typing_extensions import override
from rhoknp.processors.processor import Processor
from rhoknp.units import Document, Sentence
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
[docs]
class RegexSenter(Processor):
    """Regular-expression-based sentence splitter.

    Each line of the input is cut after every run of sentence-ending
    punctuation. Adjacent pieces are then merged back together for as long as a
    parenthesis or Japanese quotation bracket is still open, so parenthesised or
    quoted text is not split in the middle.

    Example:
        >>> from rhoknp import RegexSenter
        >>> senter = RegexSenter()
        >>> document = senter.apply("天気が良かったので散歩した。途中で先生に会った。")
    """

    #: Runs of these characters are treated as the end of a sentence.  Both the
    #: full-width and the half-width forms of ``.``, ``?`` and ``!`` are listed.
    _PERIOD_PAT: ClassVar[re.Pattern[str]] = re.compile(r"[。．.？！?!♪☆★…]+")

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"

    @override
    def apply_to_document(self, document: Document | str, timeout: int = 10) -> Document:
        """Split a document into sentences.

        Args:
            document: The document to split. A plain string is wrapped in a
                ``Document`` first.
            timeout: Maximum time, in seconds, to wait for the split to finish.

        Returns:
            A new ``Document`` built from the resulting sentences. When the
            input's ``doc_id`` is not the empty string it is copied onto the new
            document and onto each of its sentences.

        Raises:
            TimeoutError: If splitting has not finished after ``timeout`` seconds.
        """
        if isinstance(document, str):
            document = Document(document)
        doc_id = document.doc_id

        sentences: list[str] = []

        def worker() -> None:
            nonlocal sentences
            sentences = self._split_document(document.text)

        # Run the split in a daemon thread so the wait can be bounded with
        # join(timeout). On timeout the worker is abandoned, not cancelled.
        thread = threading.Thread(target=worker, daemon=True)
        thread.start()
        thread.join(timeout)
        if thread.is_alive():
            raise TimeoutError(f"Operation timed out after {timeout} seconds.")

        ret = Document.from_sentences(sentences)
        if doc_id != "":
            ret.doc_id = doc_id
            for sentence in ret.sentences:
                sentence.doc_id = doc_id
        return ret

    @override
    def apply_to_sentence(self, sentence: Sentence | str, timeout: int = 10) -> Sentence:
        """Return the sentence as is; a single sentence is never split further.

        Args:
            sentence: The sentence. A plain string is wrapped in a ``Sentence``.
            timeout: Accepted for signature compatibility; not used here.

        Returns:
            The given ``Sentence``, or a new one created from the given string.
        """
        if isinstance(sentence, str):
            sentence = Sentence(sentence)
        return sentence

    def _split_document(self, text: str) -> list[str]:
        """Split raw text into a list of sentence strings.

        The text is processed line by line, so a newline always ends a sentence.

        Args:
            text: Raw document text.

        Returns:
            The sentences in order of appearance, without empty entries.
        """
        if text == "":
            return []

        sentences: list[str] = []
        for line in text.split("\n"):
            sentences.extend(self._merge_bracketed(self._split_text_by_period(line)))
        return sentences

    def _split_text_by_period(self, text: str) -> list[str]:
        """Cut ``text`` after each punctuation run and strip every piece."""
        segments: list[str] = []
        start = 0
        for match in self._PERIOD_PAT.finditer(text):
            end = match.end()
            segments.append(text[start:end])
            start = end
        # Keep whatever follows the last punctuation run (or the whole text when
        # there was no punctuation at all).
        if start < len(text):
            segments.append(text[start:])
        return [segment.strip() for segment in segments]

    def _merge_bracketed(self, candidates: list[str]) -> list[str]:
        """Join the candidates of one line so that bracketed spans stay whole."""
        sentences: list[str] = []
        parenthesis_level = 0
        hook_bracket_level = 0
        double_hook_bracket_level = 0
        sentence = ""
        for candidate in candidates:
            sentence += candidate

            # Full-width and half-width parentheses share a single counter.
            parenthesis_level += candidate.count("（") - candidate.count("）")
            parenthesis_level += candidate.count("(") - candidate.count(")")
            hook_bracket_level += candidate.count("「") - candidate.count("」")
            double_hook_bracket_level += candidate.count("『") - candidate.count("』")

            # Emit a sentence only once every kind of bracket is balanced.
            if parenthesis_level == hook_bracket_level == double_hook_bracket_level == 0:
                if sentence.strip():
                    sentences.append(sentence.strip())
                sentence = ""

        # Brackets never balanced on this line: fall back to the plain
        # punctuation split for the leftover text.
        if sentence.strip():
            sentences.extend(self._split_text_by_period(sentence.strip()))
        return sentences