Source code for rhoknp.processors.senter

import logging
import re
import threading
from typing import ClassVar

try:
    from typing import override  # type: ignore[attr-defined]
except ImportError:
    from typing_extensions import override

from rhoknp.processors.processor import Processor
from rhoknp.units import Document, Sentence

logger = logging.getLogger(__name__)



[docs]
class RegexSenter(Processor):
    """正規表現にもとづく文分割クラス．

    Example:
        >>> from rhoknp import RegexSenter
        >>> senter = RegexSenter()
        >>> document = senter.apply("天気が良かったので散歩した。途中で先生に会った。")
    """

    _PERIOD_PAT: ClassVar[re.Pattern] = re.compile(r"[。．？！♪☆★…?!]+")  #: ピリオドとみなすパターン．

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"


[docs]
    @override
    def apply_to_document(self, document: Document | str, timeout: int = 10) -> Document:
        """文書に RegexSenter を適用する．

        Args:
            document: 文書．
            timeout: 最大処理時間．．
        """
        if isinstance(document, str):
            document = Document(document)
        doc_id = document.doc_id

        sentences: list[str] = []

        def worker() -> None:
            nonlocal sentences
            sentences = self._split_document(document.text)

        thread = threading.Thread(target=worker, daemon=True)
        thread.start()
        thread.join(timeout)

        if thread.is_alive():
            raise TimeoutError(f"Operation timed out after {timeout} seconds.")

        ret = Document.from_sentences(sentences)
        if doc_id != "":
            ret.doc_id = doc_id
            for sentence in ret.sentences:
                sentence.doc_id = doc_id
        return ret



[docs]
    @override
    def apply_to_sentence(self, sentence: Sentence | str, timeout: int = 10) -> Sentence:
        """文に RegexSenter を適用する．

        Args:
            sentence: 文．
            timeout: 最大処理時間．
        """
        if isinstance(sentence, str):
            sentence = Sentence(sentence)
        return sentence


    def _split_document(self, text: str) -> list[str]:
        if text == "":
            return []

        def split_text_by_period(text: str) -> list[str]:
            segments: list[str] = []
            start: int = 0
            for match in self._PERIOD_PAT.finditer(text):
                end: int = match.end()
                segments.append(text[start:end])
                start = end
            if start < len(text):
                segments.append(text[start:])
            return [segment.strip() for segment in segments]

        sentences: list[str] = []
        for line in text.split("\n"):
            # Split by periods
            sentence_candidates: list[str] = split_text_by_period(line)

            # Merge sentence candidates so that strings in parentheses or brackets are not split
            parenthesis_level: int = 0
            hook_bracket_level: int = 0
            double_hook_bracket_level: int = 0
            sentence: str = ""
            while sentence_candidates:
                sentence_candidate: str = sentence_candidates.pop(0)

                sentence += sentence_candidate

                parenthesis_level += sentence_candidate.count("（") - sentence_candidate.count("）")
                parenthesis_level += sentence_candidate.count("(") - sentence_candidate.count(")")
                hook_bracket_level += sentence_candidate.count("「") - sentence_candidate.count("」")
                double_hook_bracket_level += sentence_candidate.count("『") - sentence_candidate.count("』")
                if parenthesis_level == hook_bracket_level == double_hook_bracket_level == 0:
                    if sentence.strip():
                        sentences.append(sentence.strip())
                    sentence = ""
            if sentence.strip():
                sentences.extend(split_text_by_period(sentence.strip()))

        return sentences