hojichar.filters.document_filters

   1import json
   2import logging
   3import pathlib
   4import re
   5import string
   6import time
   7import unicodedata
   8from collections import Counter
   9from itertools import groupby
  10from os import PathLike
  11from typing import Any, Dict, Iterable, List, Optional, Union
  12
  13import numpy as np
  14
  15import hojichar
  16from hojichar.core.filter_interface import Filter
  17from hojichar.core.models import Document, Token
  18
  19try:
  20    import emoji
  21    from fugashi import Tagger  # type: ignore
  22
  23    is_loaded_extras = True
  24except ImportError:
  25    is_loaded_extras = False
  26
  27BASE_PATH = pathlib.Path(hojichar.__path__[0])
  28logger = logging.getLogger(__name__)
  29
  30
  31class ExampleHojiChar(Filter):
  32    """基本的なフィルタの実装例です. 末尾に'<hojichar>'を追加します."""
  33
  34    def apply(self, document: Document) -> Document:
  35        """
  36        >>> ExampleHojiChar()("hello, world")
  37        'hello, world<hojichar>'
  38        """
  39        document.text += "<hojichar>"
  40        return document
  41
  42
  43class ExampleDiscardDocumentContainKeyword(Filter):
  44    """特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です."""
  45
  46    def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None:
  47        super().__init__(*args, **kwargs)
  48        self.keyword = keyword
  49
  50    def apply(self, document: Document) -> Document:
  51        """
  52        >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
  53        True
  54        """
  55        if self.keyword in document.text:
  56            document.is_rejected = True
  57        return document
  58
  59
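
A minimal sketch of how these example filters are typically combined, following the `Compose` pattern from the package README (the sample strings are arbitrary):

from hojichar import Compose, Document
from hojichar.filters.document_filters import (
    ExampleDiscardDocumentContainKeyword,
    ExampleHojiChar,
)

cleaner = Compose([
    ExampleDiscardDocumentContainKeyword("バカ"),  # reject documents containing the keyword
    ExampleHojiChar(),                             # append '<hojichar>' to surviving documents
])

cleaner("ほうじ茶")                                    # -> 'ほうじ茶<hojichar>'
cleaner.apply(Document("あいつはバカだ")).is_rejected  # -> True
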
  60class Identity(Filter):
  61    """何も変化を加えないフィルタです. テスト・デバッグに用いられます."""
  62
  63    def apply(self, document: Document) -> Document:
  64        return document
  65
  66
  67class DiscardAll(Filter):
  68    """
  69    すべてのドキュメントを破棄するフィルタです.
  70    テスト・デバッグに用いられます.
  71    """
  72
  73    def apply(self, document: Document) -> Document:
  74        document.is_rejected = True
  75        return document
  76
  77
  78class ApplyDiscard(Filter):
  79    """
  80    上流フィルタで破棄された`Document`を空文字列にします.
  81
  82    `Document.is_rejected=True` の ドキュメントは無視されるため,
  83    このフィルタを `Compose` のコンストラクタに渡しても動作しません.
  84    このフィルタは主に`Compose` 内部や, `discard_filtered=False` を指定
  85    したデバッグ時などに利用されます.
  86    """
  87
  88    def __init__(self, *args: Any, **kwargs: Any) -> None:
  89        super().__init__(*args, **kwargs)
  90
  91    def apply(self, document: Document) -> Document:
  92        """
  93        >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
  94        ''
  95        """
  96        if document.is_rejected:
  97            document.text = ""
  98
  99        return document
 100
 101
 102class Sleep(Filter):
 103    """
 104    デバッグ用のフィルタです. 指定秒スリープします.
 105    """
 106
 107    def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None:
 108        super().__init__(*args, **kwargs)
 109        self.time = time
 110
 111    def apply(self, document: Document) -> Document:
 112        """
 113        >>> Sleep(0.1)('hello')  # After 0.1 seconds,
 114        'hello'
 115        """
 116        time.sleep(self.time)
 117        return document
 118
 119
 120class DocumentNormalizer(Filter):
 121    """
 122    Unicode の正規化をします.
 123    """
 124
 125    def __init__(self, *args: Any, **kwargs: Any) -> None:
 126        super().__init__(*args, **kwargs)
 127
 128    def apply(self, document: Document) -> Document:
 129        document.text = unicodedata.normalize("NFKC", document.text)
 130        return document
 131
 132
 133class JSONLoader(Filter):
 134    """
 135    テキストを Json として解釈し, `key` で指定した要素を文字列として
 136    document に格納します. デフォルトの `key` は 'text' です.
 137
 138    Json の読み込み, あるいは `key` の読み込みに失敗した際には例外を送出します.
 139    これらを無視する場合は, `ignore=True` にします. その際, 読み込みに失敗
 140    したドキュメントは破棄されます.
 141
 142    入力 Json に `extras` キー(辞書形式)が含まれている場合, Document.extras に自動的にマージされます。
 143    さらに `extra_keys` でフィールドを指定すると, それらの値も Document.extras に追記され, 既存の extras
 144    を上書きせずに統合できます。
 145    """
 146
 147    def __init__(
 148        self,
 149        key: str = "text",
 150        ignore: bool = False,
 151        extra_keys: Optional[List[str]] = None,
 152        *args: Any,
 153        **kwargs: Any,
 154    ) -> None:
 155        super().__init__(*args, **kwargs)
 156        self.key = key
 157        self.ignore = ignore
 158        self.extra_keys = extra_keys
 159
 160    def apply(self, document: Document) -> Document:
 161        """
 162        >>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
 163        'hello, world'
 164
 165        >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
 166        Traceback (most recent call last):
 167            ...
 168        json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
 169
 170        >>> JSONLoader()( '{"words": 2}' )
 171        Traceback (most recent call last):
 172            ...
 173        KeyError: 'text'
 174
 175        >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
 176        True
 177        """
 178        try:
 179            data = json.loads(document.text)
 180            document.text = str(data[self.key])
 181            if "extras" in data and isinstance(data["extras"], dict):
 182                document.extras.update(data["extras"])
 183            if self.extra_keys is not None:
 184                for key in self.extra_keys:
 185                    if key not in data:
 186                        continue
 187                    if key == "extras" and isinstance(data[key], dict):
 188                        document.extras.update(data[key])
 189                    else:
 190                        document.extras[key] = data[key]
 191        except Exception as e:
 192            logger.error(f"Failed to parse JSON in JSONLoader. Input document: \n{document.text}")
 193            if self.ignore:
 194                document.is_rejected = True
 195                return document
 196            else:
 197                raise e
 198
 199        return document
 200
 201
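
A short usage sketch of JSONLoader with `extra_keys`, based only on the constructor and `apply` shown above (the JSON fields are illustrative):

loader = JSONLoader(key="text", extra_keys=["url"])
doc = loader.apply(Document('{"text": "ほうじ茶", "url": "https://example.com"}'))
doc.text    # -> 'ほうじ茶'
doc.extras  # -> {'url': 'https://example.com'}
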
 202class JSONDumper(Filter):
 203    """
 204    Document.text の文字列を json に変換します.
 205    必要に応じて Document のメタデータを付与します. これにはドキュメントの破棄事由が含まれるため, 偽陽性の分析に有効です.
 206    デフォルトで `skip_rejected` が `False` にセットされており, Document の破棄フラグにかかわらず
 207    処理されます.
 208    """
 209
 210    def __init__(
 211        self,
 212        dump_reason: bool = False,
 213        p: float = 1,
 214        skip_rejected: bool = False,
 215        export_extras: bool = False,
 216        *args: Any,
 217        **kwargs: Any,
 218    ) -> None:
 219        """
 220        Args:
 221            dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False.
 222            p (float, optional): Apply probability. Defaults to 1.
 223            skip_rejected (bool, optional): 破棄済みサンプルを排除しません.
 224        """
 225        super().__init__(p, skip_rejected, *args, **kwargs)
 226        self.dump_reason = dump_reason
 227        self.export_extras = export_extras
 228
 229    def apply(self, document: Document) -> Document:
 230        """
 231        >>> JSONDumper()("hojichar")
 232        '{"text": "hojichar"}'
 233        """
 234        text = document.text
 235        if self.dump_reason:
 236            if self.export_extras:
 237                output_extras = dict(document.extras)
 238                document.text = json.dumps(
 239                    {
 240                        "text": text,
 241                        "is_rejected": document.is_rejected,
 242                        "reason": document.reject_reason,
 243                        "extras": output_extras,
 244                    },
 245                    ensure_ascii=False,
 246                )
 247            else:
 248                document.text = json.dumps(
 249                    {
 250                        "text": text,
 251                        "is_rejected": document.is_rejected,
 252                        "reason": document.reject_reason,
 253                    },
 254                    ensure_ascii=False,
 255                )
 256        else:
 257            if self.export_extras:
 258                output_extras = dict(document.extras)
 259                document.text = json.dumps(
 260                    {
 261                        "text": text,
 262                        "extras": output_extras,
 263                    },
 264                    ensure_ascii=False,
 265                )
 266            else:
 267                document.text = json.dumps({"text": text}, ensure_ascii=False)
 268        return document
 269
 270
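
A sketch of a load-and-dump round trip that carries `extras` through. Only `apply` calls are used, so the output follows directly from the code above:

loader = JSONLoader(key="text", extra_keys=["url"])
dumper = JSONDumper(export_extras=True)

doc = loader.apply(Document('{"text": "ほうじ茶", "url": "https://example.com"}'))
dumper.apply(doc).text
# -> '{"text": "ほうじ茶", "extras": {"url": "https://example.com"}}'
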
 271class DocumentLengthFilter(Filter):
 272    """
 273    `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します.
 274    デフォルトでは上限・下限ともに `None` で, 長さによる破棄は行われません.
 275    """
 276
 277    def __init__(
 278        self,
 279        min_doc_len: Optional[int] = None,
 280        max_doc_len: Optional[int] = None,
 281        *args: Any,
 282        **kwargs: Any,
 283    ) -> None:
 284        super().__init__(*args, **kwargs)
 285
 286        self.min_doc_len = min_doc_len
 287        self.max_doc_len = max_doc_len
 288
 289    def apply(self, doc: Document) -> Document:
 290        """
 291        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
 292        True
 293        """
 294        doc_len = len(doc.text)
 295        if self.min_doc_len is not None:
 296            if doc_len < self.min_doc_len:
 297                doc.is_rejected = True
 298        if self.max_doc_len is not None:
 299            if self.max_doc_len < doc_len:
 300                doc.is_rejected = True
 301        return doc
 302
 303
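
A usage sketch with both bounds set explicitly (by default both are `None`, so nothing is discarded by length):

length_filter = DocumentLengthFilter(min_doc_len=200, max_doc_len=50000)
length_filter.apply(Document("短い文書")).is_rejected   # -> True, shorter than 200 characters
length_filter.apply(Document("ほ" * 1000)).is_rejected  # -> False, within the allowed range
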
 304class NgWordsFilterJa(Filter):
 305    """
 306    日本語のNGワード(および不適切語)を含む文書を破棄します.
 307    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 308    ファイルは単語が改行で羅列されたテキストファイルです.
 309
 310    `ignore_confused` を `True` にすると,
 311    偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます.
 312    デフォルト値は `False` です.
 313    """
 314
 315    def __init__(
 316        self,
 317        dict_path: Union[str, PathLike],
 318        ignore_confused: bool = False,
 319        *args: Any,
 320        **kwargs: Any,
 321    ) -> None:
 322        super().__init__(*args, **kwargs)
 323
 324        with open(dict_path, encoding="utf-8") as fp:
 325            ng_words = fp.read().split("\n")
 326        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
 327
 328        if ignore_confused:
 329            words_katakana = []
 330            words_not_katakana = []
 331            for w in ng_words:
 332                if re.fullmatch(r"[ァ-ヴー]+", w):
 333                    words_katakana.append(re.escape(w))
 334                else:
 335                    words_not_katakana.append(re.escape(w))
 336            katakana_pat = "|".join(words_katakana)
 337            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
 338            pat = "|".join(words_not_katakana) + "|" + katakana_pat
 339            self.keyword_pat = re.compile(pat)
 340        else:
 341            ng_words = [re.escape(w) for w in ng_words]
 342            pat = "|".join(ng_words)
 343            self.keyword_pat = re.compile(pat)
 344
 345    def apply(self, doc: Document) -> Document:
 346        regex_match = self.keyword_pat.search(doc.text)
 347        if regex_match:
 348            doc.is_rejected = True
 349            self.matched_text = regex_match.group()
 350            self.matched_text_neighbor = doc.text[
 351                regex_match.start() - 20 : regex_match.end() + 20
 352            ]
 353
 354        return doc
 355
 356
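
A sketch of the expected dictionary format (one keyword per line) and the `ignore_confused` behavior; the file path and keywords are made up for illustration:

import pathlib
import tempfile

ng_dict = pathlib.Path(tempfile.mkdtemp()) / "ng_words.txt"
ng_dict.write_text("アス\n不適切語\n", encoding="utf-8")  # one keyword per line

ng_filter = NgWordsFilterJa(ng_dict, ignore_confused=True)
ng_filter.apply(Document("アスパラガス")).is_rejected  # -> False, "アス" is followed by katakana
ng_filter.apply(Document("彼はアスだ")).is_rejected    # -> True
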
 357class NgWordsFilterEn(Filter):
 358    """
 359    英語のNGワード(および不適切語)を含む文書を破棄します.
 360    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 361    ファイルは単語が改行で羅列されたテキストファイルです.
 362    """
 363
 364    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
 365        super().__init__(*args, **kwargs)
 366
 367        with open(dict_path, encoding="utf-8") as fp:
 368            ng_words = fp.read().split("\n")
 369        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
 370        pat = "|".join(ng_words)
 371        # 英語の単語境界を考慮したパターン. 前後が行頭/行末・空白・カンマ・ピリオドである単語にマッチする.
 372        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)
 373
 374    def apply(self, doc: Document) -> Document:
 375        if self.keyword_pat.search(doc.text):
 376            doc.is_rejected = True
 377        return doc
 378
 379
 380class DiscardAdultContentJa(NgWordsFilterJa):
 381    """
 382    日本語のアダルトキーワード(および不適切語)を含む文書を破棄します.
 383    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 384    ファイルは単語が改行で羅列されたテキストファイルです.
 385    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です.
 386    """
 387
 388    def __init__(
 389        self,
 390        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
 391        *args: Any,
 392        **kwargs: Any,
 393    ) -> None:
 394        super().__init__(dict_path, *args, **kwargs)
 395
 396    def apply(self, doc: Document) -> Document:
 397        """
 398        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
 399        True
 400
 401        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
 402        False
 403
 404        挙動は正しいが誤検知しているケースの例です. NGキーワード "アス" にマッチします
 405        (他にも「リュックサック」中の「サック」などが該当します).
 406        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected
 407        True
 408        """
 409        return super().apply(doc)
 410
 411
 412class DiscardAdultContentEn(NgWordsFilterEn):
 413    """
 414    英語のアダルトキーワード(および不適切語)を含む文書を破棄します.
 415    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 416    ファイルは単語が改行で羅列されたテキストファイルです.
 417    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です.
 418    """
 419
 420    def __init__(
 421        self,
 422        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
 423        *args: Any,
 424        **kwargs: Any,
 425    ) -> None:
 426        super().__init__(dict_path, *args, **kwargs)
 427
 428    def apply(self, doc: Document) -> Document:
 429        """
 430        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
 431        True
 432
 433        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
 434        False
 435        """
 436        return super().apply(doc)
 437
 438
 439class DiscardDiscriminationContentJa(NgWordsFilterJa):
 440    """
 441    日本語の差別キーワード(および不適切語)を含む文書を破棄します.
 442    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 443    ファイルは単語が改行で羅列されたテキストファイルです.
 444    デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です.
 445    """
 446
 447    def __init__(
 448        self,
 449        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
 450        *args: Any,
 451        **kwargs: Any,
 452    ):
 453        super().__init__(dict_path, *args, **kwargs)
 454
 455    def apply(self, doc: Document) -> Document:
 456        """
 457        >>> DiscardDiscriminationContentJa().\
 458            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
 459        True
 460
 461        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
 462        False
 463        """
 464        return super().apply(doc)
 465
 466
 467class DiscardViolenceContentJa(NgWordsFilterJa):
 468    """
 469    日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します.
 470    `dict_path` で指定したファイルから, キーワードのリストを得ます.
 471    ファイルは単語が改行で羅列されたテキストファイルです.
 472    デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です.
 473    """
 474
 475    def __init__(
 476        self,
 477        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
 478        *args: Any,
 479        **kwargs: Any,
 480    ) -> None:
 481        super().__init__(dict_path, *args, **kwargs)
 482
 483    def apply(self, doc: Document) -> Document:
 484        """
 485        >>> DiscardViolenceContentJa()\
 486            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
 487        True
 488
 489        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
 490        False
 491        """
 492        return super().apply(doc)
 493
 494
 495class DiscardBBSComments(Filter):
 496    """
 497    正規表現 "BBS Pattern" に `max_allowed_num` 回より多くマッチする文書を破棄します.
 498    `max_allowed_num` のデフォルト値は14です.
 499    正規表現 "BBS Pattern" は下記のリンクで検証可能です.
 500    https://regex101.com/r/ybQvL2/1
 501    """
 502
 503    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
 504        super().__init__(*args, **kwargs)
 505
 506        self.max_allowed_num = max_allowed_num
 507        self.keyword_pat = re.compile(
 508            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
 509        )
 510
 511    def apply(self, doc: Document) -> Document:
 512        """
 513        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
 514        True
 515
 516        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
 517        False
 518        """
 519        bbs_factor = self.keyword_pat.findall(doc.text)
 520        if len(bbs_factor) > self.max_allowed_num:
 521            doc.is_rejected = True
 522        return doc
 523
 524
 525class DiscardAds(Filter):
 526    """
 527    主に広告キーワードを `max_allowed_num` より多く含む文書を破棄します.
 528    デフォルトで `max_allowed_num` は14です.
 529    `dict_path` で指定したファイルから, 広告キーワードのリストを得ます.
 530    ファイルは単語が改行で羅列されたテキストファイルです.
 531    デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です.
 532    """
 533
 534    def __init__(
 535        self,
 536        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
 537        max_allowed_num: int = 14,
 538        *args: Any,
 539        **kwargs: Any,
 540    ):
 541        super().__init__(*args, **kwargs)
 542
 543        self.max_allow_num = max_allowed_num
 544        with open(dict_path, encoding="utf-8") as fp:
 545            ng_words = fp.read().split("\n")
 546        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
 547        pat = r"|".join(ng_words)
 548        self.keyword_pat = re.compile(pat)
 549
 550    def apply(self, doc: Document) -> Document:
 551        """
 552        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
 553        True
 554
 555        >>> DiscardAds().apply(Document("おはよう")).is_rejected
 556        False
 557        """
 558        ads_factor = self.keyword_pat.findall(doc.text)
 559        if len(ads_factor) > self.max_allow_num:
 560            doc.is_rejected = True
 561        return doc
 562
 563
 564class AcceptJapanese(Filter):
 565    """
 566    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます.
 567        1. テキストを左から`lookup_size` (デフォルトで50字) 参照し,
 568        ひらがな・カタカナが存在すれば日本語と判定する.
 569    """
 570
 571    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
 572        super().__init__(*args, **kwargs)
 573
 574        self.lookup_size = lookup_size
 575        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")
 576
 577    def apply(self, doc: Document) -> Document:
 578        """
 579        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
 580        True
 581
 582        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
 583        True
 584
 585        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
 586        False
 587        """
 588        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
 589            doc.is_rejected = True
 590        return doc
 591
 592
 593class DiscardRareKuten(Filter):
 594    """
 595    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます.
 596    ドキュメントを句点"。"で区切り, 平均文長が
 597    `max_average_sentence_length` より長い場合は破棄します.
 598    `max_average_sentence_length` のデフォルト値は100です.
 599    このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します.
 600    """
 601
 602    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
 603        super().__init__(*args, **kwargs)
 604
 605        self.max_average_sentence_length = max_average_sentence_length
 606        self.kuten_pat = re.compile(r"。")
 607
 608    def apply(self, doc: Document) -> Document:
 609        """
 610        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
 611        False
 612        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
 613        True
 614        """
 615        kuten_lst = self.kuten_pat.findall(doc.text)
 616        min_kuten_num = len(doc.text) / self.max_average_sentence_length
 617        if len(kuten_lst) < min_kuten_num:
 618            doc.is_rejected = True
 619        return doc
 620
 621
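
A small worked example of the threshold: with the default `max_average_sentence_length=100`, a document is accepted only if it contains at least `len(text) / 100` occurrences of "。":

doc = Document("あ" * 300 + "。")           # 301 characters, one kuten
DiscardRareKuten().apply(doc).is_rejected   # -> True, because 1 < 301 / 100

doc = Document(("あ" * 50 + "。") * 6)       # 306 characters, six kuten
DiscardRareKuten().apply(doc).is_rejected   # -> False, because 6 >= 306 / 100
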
 622class HeaderFooterTagsRemover(Filter):
 623    """
 624    ドキュメントの冒頭・末尾のトークンを調査し, ヘッダー・フッター的な
 625    タグが存在していた場合, そのトークンを除去します.
 626
 627    このフィルタを通す前に, 事前にセンテンスレベルにトーカナイズしておいてください.
 628    このフィルタでは Document.token にのみ変更が加えられるので, 出力前 あるいは 下流フィルタで
 629    Document.text に変更を加える前にトークンをマージしておいてください.
 630    """
 631
 632    def __init__(
 633        self,
 634        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
 635        *args: Any,
 636        **kwargs: Any,
 637    ) -> None:
 638        super().__init__(*args, **kwargs)
 639
 640        with open(dict_path) as fp:
 641            keywords = fp.read().split("\n")
 642        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
 643        self.keyword_pat = re.compile(r"|".join(keywords))
 644
 645    def apply(self, doc: Document) -> Document:
 646        if len(doc.tokens) == 0:
 647            return doc
 648
 649        lookup_size = 0
 650        if 1 <= len(doc.tokens) < 4:
 651            lookup_size = 1
 652        elif 4 <= len(doc.tokens) < 6:
 653            lookup_size = 2
 654        elif 6 <= len(doc.tokens):
 655            lookup_size = 3
 656
 657        for i in range(lookup_size):
 658            if self.should_drop_token(doc.tokens[i]):
 659                doc.tokens[i].is_rejected = True
 660            if self.should_drop_token(doc.tokens[-(i + 1)]):
 661                doc.tokens[-(i + 1)].is_rejected = True
 662
 663        return doc
 664
 665    def should_drop_token(self, token: Token) -> bool:
 666        """
 667        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
 668        True
 669
 670        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
 671        False
 672
 673        Comment.
 674        Original legacy code removed a pattern r"« _ | Main | _ »" .
 675        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
 676        It seems unintended behavior, so I fix this.
 677        """
 678        if self.keyword_pat.match(token.text):
 679            return True
 680        else:
 681            return False
 682
 683
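
As the docstring notes, this filter only marks entries in `Document.tokens`, so the text has to be split into sentence-level tokens beforehand and merged back afterwards. A pipeline sketch, assuming the sentence-level tokenizer and merger in `hojichar.filters.tokenization` (`SentenceTokenizer` / `MergeTokens`; check that module for the exact class names):

from hojichar import Compose
from hojichar.filters import document_filters, tokenization

cleaner = Compose([
    tokenization.SentenceTokenizer(),            # split Document.text into sentence tokens
    document_filters.HeaderFooterTagsRemover(),  # mark header/footer-like tokens as rejected
    tokenization.MergeTokens(),                  # merge the remaining tokens back into Document.text
])
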
 684class MaskPersonalInformation(Filter):
 685    """
 686    ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします.
 687    """
 688
 689    def __init__(self, *args: Any, **kwargs: Any) -> None:
 690        super().__init__(*args, **kwargs)
 691
 692        self.phone_pat = re.compile(
 693            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
 694        )
 695        self.email_pat = re.compile(
 696            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
 697        )
 698
 699    def apply(self, doc: Document) -> Document:
 700        """
 701        >>> MaskPersonalInformation()('06-1234-5678')
 702        '06-1234-XXXX'
 703        >>> MaskPersonalInformation()('075-123-4567')
 704        '075-123-XXXX'
 705        >>> MaskPersonalInformation()('0166-12-3456')
 706        '0166-12-XXXX'
 707        >>> MaskPersonalInformation()('09808-1-2345')
 708        '09808-1-XXXX'
 709        >>> MaskPersonalInformation()('090-1234-5678')
 710        '090-1234-XXXX'
 711        >>> MaskPersonalInformation()('0751234567')
 712        '075123XXXX'
 713        >>> MaskPersonalInformation()('08012345678')
 714        '0801234XXXX'
 715        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
 716        '連絡は075-123-XXXX まで'
 717        >>> MaskPersonalInformation()('+81-80-1234-5678')
 718        '+81-80-1234-XXXX'
 719        >>> MaskPersonalInformation()('+818012345678')
 720        '+81801234XXXX'
 721        >>> MaskPersonalInformation()('hogehoge@example.com')
 722        'xxxx@yyy.com'
 723        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
 724        '何かあれば xxxx@yyy.jp まで連絡'
 725        """
 726        text = self.phone_pat.sub(r"\1XXXX", doc.text)
 727        text = self.email_pat.sub(r"xxxx@yyy\1", text)
 728        doc.text = text
 729        return doc
 730
 731
 732class DiscardTooManyNouns(Filter):
 733    """
 734    [!CAUTION] This filter requires `fugashi` package. Please install it
 735    by `pip install 'hojichar[all]'`.
 736
 737    A filter that removes document with too many nouns in Japanese i.e.,
 738    documents such as advertisement, word salad, etc ...
 739    """
 740
 741    def __init__(
 742        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
 743    ) -> None:
 744        """
 745        Args:
 746            threshold: documents whose noun ratio is this value or higher will be discarded
 747            max_parse_chars: maximum number of characters to parse at a time. Too large a value may cause a segmentation fault while parsing the document.
 748            *args:
 749            **kwargs:
 750        """
 751        super().__init__(*args, **kwargs)
 752        assert is_loaded_extras, (
 753            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
 754        )
 755
 756        self.threshold = threshold
 757        self.max_parse_chars = max_parse_chars
 758        self.tagger = Tagger("-Owakati")
 759        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
 760            "MeCab dictionary must be unidic"
 761        )
 762
 763    def _chunk_text(self, text: str) -> Iterable[str]:
 764        """Slice text into chunks of `max_parse_chars` length."""
 765        step = self.max_parse_chars
 766        for i in range(0, len(text), step):
 767            yield text[i : i + step]
 768
 769    def apply(self, doc: Document) -> Document:
 770        """
 771        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
 772        False
 773        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
 774        True
 775        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
 776        False
 777        """
 778        # remove "補助記号" from part-of-speech statistics
 779        # because they often decrease the noun ratio,
 780        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
 781        # however, we don't want such sentence
 782
 783        pos_count: Counter[str] = Counter()
 784        for chunk in self._chunk_text(doc.text):
 785            for word in self.tagger(chunk):
 786                if word.feature.pos1 != "補助記号":
 787                    pos_count[word.feature.pos1] += 1
 788
 789        try:
 790            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
 791        except ZeroDivisionError:
 792            noun_ratio = 0.0
 793        if noun_ratio >= self.threshold:
 794            doc.is_rejected = True
 795        return doc
 796
 797
 798class CharRepetitionRatioFilter(Filter):
 799    """
 800    文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します.
 801    名詞の連続からなるような広告テキストを取り除くのに有効です.
 802
 803    実装は, BigScience で採用されていた前処理を参考にしています.
 804    元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453  # noqa: E501
 805
 806    「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが,
 807    これは文書長の影響を軽減するためだとされています.
 808
 809    掲示板のテキストが引っかかりやすい傾向があります.
 810    13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0
 811    的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう
 812    """
 813
 814    def __init__(
 815        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
 816    ) -> None:
 817        """
 818
 819        Args:
 820            threshold: documents whose character repetition ratio is this value or higher will be discarded
 821            ngram_size: character ngram size. A larger value decreases false positives on long documents
 822            *args:
 823            **kwargs:
 824        """  # noqa: E501
 825
 826        super().__init__(*args, **kwargs)
 827        self.threshold = threshold
 828        self.ngram_size = ngram_size
 829
 830    def apply(self, doc: Document) -> Document:
 831        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
 832        if ratio >= self.threshold:
 833            doc.is_rejected = True
 834        return doc
 835
 836    @staticmethod
 837    def compute_character_repetition_ratio(
 838        document: str, character_repetition_length: int
 839    ) -> float:
 840        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
 841            character_ngrams: List[str] = [
 842                document[i : i + n] for i in range(len(document) - n + 1)
 843            ]
 844            freq_character_ngrams_dict: Dict[str, int] = {}
 845            for character_ngram in character_ngrams:
 846                freq_character_ngrams_dict[character_ngram] = (
 847                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
 848                )
 849            return freq_character_ngrams_dict
 850
 851        freq_character_ngrams_dict = get_freq_character_ngrams(
 852            document, character_repetition_length
 853        )
 854        if len(freq_character_ngrams_dict) == 0:
 855            return 0.0
 856        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
 857        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
 858        val_one = len([el for el in freq_character_ngrams if el == 1])
 859        num_rep_character_ngrams = min(
 860            int(np.sqrt(len(freq_character_ngrams))),
 861            len(freq_character_ngrams) - val_one,
 862        )
 863        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
 864            freq_character_ngrams
 865        )
 866        return character_repetition_ratio
 867
 868
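
The ratio can be inspected directly through the static method, which helps when tuning `threshold` and `ngram_size`; the strings below are arbitrary examples:

repetitive = "お買い得!" * 5
normal = "ほうじ茶は焙煎した緑茶で, 香ばしい風味が特徴です."

CharRepetitionRatioFilter.compute_character_repetition_ratio(repetitive, 5)  # -> roughly 0.43, above the default threshold of 0.33
CharRepetitionRatioFilter.compute_character_repetition_ratio(normal, 5)      # -> 0.0, every 5-gram occurs only once
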
 869class WordRepetitionRatioFilter(Filter):
 870    """
 871    [!CAUTION] This filter requires `fugashi` package. Please install it
 872    by `pip install 'hojichar[all]'`.
 873
 874    単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ.
 875    BigScienceで採用されていた前処理を参考にしている.
 876
 877    名詞が連打されているような広告テキストを取り除くのに有効な様子です.
 878    まともな文書がたまたま2回繰り返されている場合もあり, これを取り除いて良いのかは分かりません.
 879    例:
 880    "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57)
 881    ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ
 882    られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる
 883    なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上
 884    高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら
 885    経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時
 886    56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄
 887    り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入
 888    るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増
 889    益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、
 890    電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ
 891    イの回復で収益が急回"
 892    """  # noqa: E501
 893
 894    def __init__(
 895        self,
 896        threshold: float = 0.40,
 897        ngram_size: int = 7,
 898        max_parse_chars: int = 100_000,
 899        *args: Any,
 900        **kwargs: Any,
 901    ) -> None:
 902        """
 903
 904        Args:
 905            threshold: documents whose word repetition ratio is this value or higher will be discarded
 906            ngram_size: word ngram size. A larger value decreases false positives on long documents
 907            max_parse_chars: maximum number of characters to parse at a time. Too large a value may cause a segmentation fault while parsing the document.
 908            *args:
 909            **kwargs:
 910        """  # noqa: E501
 911        super().__init__(*args, **kwargs)
 912        assert is_loaded_extras, (
 913            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
 914        )
 915
 916        self.threshold = threshold
 917        self.ngram_size = ngram_size
 918        self.max_parse_chars = max_parse_chars
 919        self.tagger = Tagger("-Owakati")
 920
 921    def _chunk_text(self, text: str) -> Iterable[str]:
 922        """Split text into chunks of `max_parse_chars` length."""
 923        step = self.max_parse_chars
 924        for i in range(0, len(text), step):
 925            yield text[i : i + step]
 926
 927    def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]:
 928        freq: Dict[str, int] = {}
 929        if n <= 0 or len(words) < n:
 930            return freq
 931        for i in range(len(words) - n + 1):
 932            key = " ".join(words[i : i + n])
 933            freq[key] = freq.get(key, 0) + 1
 934        return freq
 935
 936    def apply(self, doc: Document) -> Document:
 937        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
 938        if ratio >= self.threshold:
 939            doc.is_rejected = True
 940        return doc
 941
 942    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
 943        total_counter: Counter[str] = Counter()
 944
 945        for chunk in self._chunk_text(document):
 946            words = [w.surface for w in self.tagger(chunk)]
 947            total_counter.update(self._get_freq_word_ngrams(words, n))
 948
 949        if not total_counter:
 950            return 0.0
 951
 952        total = sum(total_counter.values())
 953        repeated = sum(v for v in total_counter.values() if v > 1)
 954        return repeated / total
 955
 956
 957class DiscardTooManySpecialToken(Filter):
 958    """
 959    [!CAUTION] This filter requires `emoji` package. Please install it
 960    by `pip install 'hojichar[all]'`.
 961
 962    句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ
 963    元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16  # noqa: E501
 964    """
 965
 966    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
 967        """
 968
 969        Args:
 970            threshold: document whose special token ratio is higher than this value will be discarded
 971            *args:
 972            **kwargs:
 973        """  # noqa: E501
 974        super().__init__(*args, **kwargs)
 975
 976        # digits are not regarded as special tokens
 977        # otherwise many false positives are made, i.e., good documents discarded
 978        main_special_characters = string.punctuation + string.whitespace  # + string.digits
 979        other_special_characters = (
 980            "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–▬…✦�­£​•€«»°·═"
 981            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
 982            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
 983            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
 984            "」﴾》�"
 985        )
 986
 987        en_emoji = emoji.EMOJI_DATA.keys()
 988
 989        special_characters_default = set(main_special_characters + other_special_characters)
 990        special_characters_default.update(en_emoji)
 991        self.special_characters = special_characters_default
 992
 993        self.threshold = threshold
 994
 995    def _compute_special_characters_ratio(self, text: str) -> float:
 996        if len(text) == 0:
 997            return 0
 998
 999        special_characters_ratio = len(
1000            [char for char in text if char in self.special_characters]
1001        ) / len(text)
1002        return special_characters_ratio
1003
1004    def apply(self, doc: Document) -> Document:
1005        special_characters_ratio = self._compute_special_characters_ratio(doc.text)
1006
1007        if special_characters_ratio > self.threshold:
1008            doc.is_rejected = True
1009        return doc
1010
1011
1012class SingleCharacterRepetitionFilter(Filter):
1013    """
1014    単一文字が大量に繰り返されているような文書を取り除くためのフィルタ
1015    そのような文書はノイズである可能性が高いため
1016    参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい
1017    https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset  # noqa: E501
1018    """
1019
1020    def __init__(
1021        self,
1022        threshold: int = 200,
1023        *args: Any,
1024        **kwargs: Any,
1025    ) -> None:
1026        """
1027        Args:
 1028            threshold: The document is removed if any single character is repeated this many times or more
1029            *args:
1030            **kwargs:
1031        """
1032        super().__init__(*args, **kwargs)
1033        self.threshold = threshold
1034
1035    def _is_repeat_contained(self, text: str) -> bool:
1036        groups = groupby(text)
1037        is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups)
1038        return is_repeat_contained
1039
1040    def apply(self, doc: Document) -> Document:
1041        if self._is_repeat_contained(doc.text):
1042            doc.is_rejected = True
1043        return doc
1044
1045
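
A quick sketch of the run-length check (the backslash case mirrors the BigScience anecdote linked above):

SingleCharacterRepetitionFilter().apply(Document("\\" * 200)).is_rejected       # -> True, a run of 200 identical characters
SingleCharacterRepetitionFilter().apply(Document("ほうじ茶" * 100)).is_rejected  # -> False, no long single-character run
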
1046class DiscardTooManyEndingEllipsis(Filter):
1047    """
1048    ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです.
1049    ellipsisとしては ... と … を用いている
1050    同様のフィルタが RedPajama v2で用いられています.
1051
1052    例として, 以下のような文書を検知します.
1053    ```
1054    ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
1055    バツイチアラフォー 婚活ち女性の特徴と子持な付...
1056    ```
1057
 1058    デフォルトのしきい値は0.7で, これはC4から0.1%を削るような,
1059    precisionを重視した設定です.
1060    """
1061
1062    def __init__(
1063        self,
1064        threshold: float = 0.7,
1065        *args: Any,
1066        **kwargs: Any,
1067    ) -> None:
1068        """
1069        Args:
1070            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
1071            *args:
1072            **kwargs:
1073        """  # noqa: E501
1074        super().__init__(*args, **kwargs)
1075        self.threshold = threshold
1076        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n
1077
1078    def apply(self, doc: Document) -> Document:
1079        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
1080        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
1081        ellipsis_ratio = ellipsis_count / newline_count
1082
1083        if ellipsis_ratio > self.threshold:
1084            doc.is_rejected = True
1085        return doc
1086
1087
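
A small sketch of the ratio: the pattern counts lines ending in "..." or "…" and divides by the number of newlines:

truncated = "続きを読む...\n続きを読む...\n続きを読む...\nほうじ茶"
DiscardTooManyEndingEllipsis().apply(Document(truncated)).is_rejected  # -> True, 3 / 3 > 0.7

normal = "ほうじ茶\n緑茶\n紅茶"
DiscardTooManyEndingEllipsis().apply(Document(normal)).is_rejected     # -> False, 0 / 2
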
1088class DiscardTooShortLines(Filter):
1089    """
1090    短い行を大量に含む文書を捨てるためのフィルタです.
1091
1092    メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です.
1093    """
1094
1095    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
1096        """
1097        Args:
 1098            threshold: The document is removed if the ratio of short lines (10 characters or fewer) is higher than this value.
1099            *args:
1100            **kwargs:
1101        """  # noqa: E501
1102        super().__init__(*args, **kwargs)
1103        self.threshold = threshold
1104        # この値は適当に決め打ち
1105        self.minimum_line_length = 10
1106
1107    def apply(self, doc: Document) -> Document:
1108        lines = [len(x) for x in doc.text.split("\n")]
1109        short_lines = [x for x in lines if x <= self.minimum_line_length]
1110        if (len(short_lines) / len(lines)) > self.threshold:
1111            doc.is_rejected = True
1112        return doc
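
Putting several of the filters above together: a sketch of a JSONL cleaning pipeline in the style of the package README. The filter choice, thresholds, and file names are illustrative rather than a recommended configuration, and whether rejected lines come back empty or as a JSON record with the rejection reason depends on the Compose settings, so check the output on a small sample first.

from hojichar import Compose
from hojichar.filters import document_filters

cleaner = Compose([
    document_filters.JSONLoader(key="text"),
    document_filters.DocumentNormalizer(),
    document_filters.AcceptJapanese(),
    document_filters.DocumentLengthFilter(min_doc_len=200, max_doc_len=50000),
    document_filters.DiscardAdultContentJa(),
    document_filters.DiscardAds(),
    document_filters.MaskPersonalInformation(),
    document_filters.JSONDumper(dump_reason=True),
])

with open("input.jsonl", encoding="utf-8") as fin, open("cleaned.jsonl", "w", encoding="utf-8") as fout:
    for line in fin:
        fout.write(cleaner(line) + "\n")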
class ExampleHojiChar(hojichar.core.filter_interface.Filter):
32class ExampleHojiChar(Filter):
33    """基本的なフィルタの実装例です. 末尾に'<hojichar>'を追加します."""
34
35    def apply(self, document: Document) -> Document:
36        """
37        >>> ExampleHojiChar()("hello, world")
38        'hello, world<hojichar>'
39        """
40        document.text += "<hojichar>"
41        return document

基本的なフィルタの実装例です. 末尾に''を追加します.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
35    def apply(self, document: Document) -> Document:
36        """
37        >>> ExampleHojiChar()("hello, world")
38        'hello, world<hojichar>'
39        """
40        document.text += "<hojichar>"
41        return document
>>> ExampleHojiChar()("hello, world")
'hello, world<hojichar>'
class ExampleDiscardDocumentContainKeyword(hojichar.core.filter_interface.Filter):
44class ExampleDiscardDocumentContainKeyword(Filter):
45    """特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です."""
46
47    def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None:
48        super().__init__(*args, **kwargs)
49        self.keyword = keyword
50
51    def apply(self, document: Document) -> Document:
52        """
53        >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
54        True
55        """
56        if self.keyword in document.text:
57            document.is_rejected = True
58        return document

特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です.

ExampleDiscardDocumentContainKeyword(keyword: str, *args: Any, **kwargs: Any)
47    def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None:
48        super().__init__(*args, **kwargs)
49        self.keyword = keyword

Initialize the filter.

Parameters

p : float The probability of applying the filter. If p is 1, the filter will always be applied. skip_rejected : bool If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False. random_state : Optional[Union[int, np.random.Generator]] Seed for the random number generator. If None, a new random number generator will be created. If None, and use in the Compose class, the random state is shared with the Compose object. use_batch : bool If True, the filter will process documents in batches in the apply_stream method. batch_size : int The size of the batch to process documents in the apply_stream method. kwargs : Any Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
51    def apply(self, document: Document) -> Document:
52        """
53        >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
54        True
55        """
56        if self.keyword in document.text:
57            document.is_rejected = True
58        return document
>>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
True
class Identity(hojichar.core.filter_interface.Filter):
61class Identity(Filter):
62    """何も変化を加えないフィルタです. テスト・デバッグに用いられます."""
63
64    def apply(self, document: Document) -> Document:
65        return document

何も変化を加えないフィルタです. テスト・デバッグに用いられます.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
64    def apply(self, document: Document) -> Document:
65        return document

Definition of filter behavior.

The document must have a protocol TextContent, and mostly used hojichar.Document class.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document Input document

Returns

Document Processed Document

class DiscardAll(hojichar.core.filter_interface.Filter):
68class DiscardAll(Filter):
69    """
70    すべてのドキュメントを破棄するフィルタです.
71    テスト・デバッグに用いられます.
72    """
73
74    def apply(self, document: Document) -> Document:
75        document.is_rejected = True
76        return document

すべてのドキュメントを破棄するフィルタです. テスト・デバッグに用いられます.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
74    def apply(self, document: Document) -> Document:
75        document.is_rejected = True
76        return document

Definition of filter behavior.

The document must have a protocol TextContent, and mostly used hojichar.Document class.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document Input document

Returns

Document Processed Document

class ApplyDiscard(hojichar.core.filter_interface.Filter):
 79class ApplyDiscard(Filter):
 80    """
 81    上流フィルタで破棄された`Document`を空文字列にします.
 82
 83    `Document.is_rejected=True` の ドキュメントは無視されるため,
 84    このフィルタを `Compose` のコンストラクタに渡しても動作しません.
 85    このフィルタは主に`Compose` 内部や, `discard_filtered=False` を指定
 86    したデバッグ時などに利用されます.
 87    """
 88
 89    def __init__(self, *args: Any, **kwargs: Any) -> None:
 90        super().__init__(*args, **kwargs)
 91
 92    def apply(self, document: Document) -> Document:
 93        """
 94        >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
 95        ''
 96        """
 97        if document.is_rejected:
 98            document.text = ""
 99
100        return document

上流フィルタで破棄されたDocumentを空文字列にします.

Document.is_rejected=True の ドキュメントは無視されるため, このフィルタを Compose のコンストラクタに渡しても動作しません. このフィルタは主にCompose 内部や, discard_filtered=False を指定 したデバッグ時などに利用されます.

ApplyDiscard(*args: Any, **kwargs: Any)
89    def __init__(self, *args: Any, **kwargs: Any) -> None:
90        super().__init__(*args, **kwargs)

Initialize the filter.

Parameters

p : float The probability of applying the filter. If p is 1, the filter will always be applied. skip_rejected : bool If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False. random_state : Optional[Union[int, np.random.Generator]] Seed for the random number generator. If None, a new random number generator will be created. If None, and use in the Compose class, the random state is shared with the Compose object. use_batch : bool If True, the filter will process documents in batches in the apply_stream method. batch_size : int The size of the batch to process documents in the apply_stream method. kwargs : Any Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
 92    def apply(self, document: Document) -> Document:
 93        """
 94        >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
 95        ''
 96        """
 97        if document.is_rejected:
 98            document.text = ""
 99
100        return document
>>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
''
class Sleep(hojichar.core.filter_interface.Filter):
103class Sleep(Filter):
104    """
105    デバッグ用のフィルタです. 指定秒スリープします.
106    """
107
108    def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None:
109        super().__init__(*args, **kwargs)
110        self.time = time
111
112    def apply(self, document: Document) -> Document:
113        """
114        >>> Sleep(0.1)('hello')  # After 0.1 seconds,
115        'hello'
116        """
117        time.sleep(self.time)
118        return document

デバッグ用のフィルタです. 指定秒スリープします.

Sleep(time: float = 1.0, *args: Any, **kwargs: Any)
108    def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None:
109        super().__init__(*args, **kwargs)
110        self.time = time

Initialize the filter.

Parameters

p : float The probability of applying the filter. If p is 1, the filter will always be applied. skip_rejected : bool If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False. random_state : Optional[Union[int, np.random.Generator]] Seed for the random number generator. If None, a new random number generator will be created. If None, and use in the Compose class, the random state is shared with the Compose object. use_batch : bool If True, the filter will process documents in batches in the apply_stream method. batch_size : int The size of the batch to process documents in the apply_stream method. kwargs : Any Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
112    def apply(self, document: Document) -> Document:
113        """
114        >>> Sleep(0.1)('hello')  # After 0.1 seconds,
115        'hello'
116        """
117        time.sleep(self.time)
118        return document
>>> Sleep(0.1)('hello')  # After 0.1 seconds,
'hello'
class DocumentNormalizer(hojichar.core.filter_interface.Filter):
121class DocumentNormalizer(Filter):
122    """
123    Unicode の正規化をします.
124    """
125
126    def __init__(self, *args: Any, **kwargs: Any) -> None:
127        super().__init__(*args, **kwargs)
128
129    def apply(self, document: Document) -> Document:
130        document.text = unicodedata.normalize("NFKC", document.text)
131        return document

Unicode の正規化をします.

DocumentNormalizer(*args: Any, **kwargs: Any)
126    def __init__(self, *args: Any, **kwargs: Any) -> None:
127        super().__init__(*args, **kwargs)

Initialize the filter.

Parameters

p : float The probability of applying the filter. If p is 1, the filter will always be applied. skip_rejected : bool If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False. random_state : Optional[Union[int, np.random.Generator]] Seed for the random number generator. If None, a new random number generator will be created. If None, and use in the Compose class, the random state is shared with the Compose object. use_batch : bool If True, the filter will process documents in batches in the apply_stream method. batch_size : int The size of the batch to process documents in the apply_stream method. kwargs : Any Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
129    def apply(self, document: Document) -> Document:
130        document.text = unicodedata.normalize("NFKC", document.text)
131        return document

Definition of filter behavior.

The document must have a protocol TextContent, and mostly used hojichar.Document class.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document Input document

Returns

Document Processed Document

class JSONLoader(hojichar.core.filter_interface.Filter):
134class JSONLoader(Filter):
135    """
136    テキストを Json として解釈し, `key` で指定した要素を文字列として
137    doument に格納します.デフォルトの `key` は 'text' です.
138
139    Json の読み込み, あるいは `key` の読み込みに失敗した際には例外を送出します.
140    これらを無視する場合は, `ignore=True` にします. その際, 読み込みに失敗
141    したドキュメントは破棄されます.
142
143    入力 Json に `extras` キー(辞書形式)が含まれている場合, Document.extras に自動的にマージされます。
144    さらに `extra_keys` でフィールドを指定すると, それらの値も Document.extras に追記され, 既存の extras
145    を上書きせずに統合できます。
146    """
147
148    def __init__(
149        self,
150        key: str = "text",
151        ignore: bool = False,
152        extra_keys: Optional[List[str]] = None,
153        *args: Any,
154        **kwargs: Any,
155    ) -> None:
156        super().__init__(*args, **kwargs)
157        self.key = key
158        self.ignore = ignore
159        self.extra_keys = extra_keys
160
161    def apply(self, document: Document) -> Document:
162        """
163        >>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
164        'hello, world'
165
166        >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
167        Traceback (most recent call last):
168            ...
169        json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
170
171        >>> JSONLoader()( '{"words": 2}' )
172        Traceback (most recent call last):
173            ...
174        KeyError: 'text'
175
176        >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
177        True
178        """
179        try:
180            data = json.loads(document.text)
181            document.text = str(data[self.key])
182            if "extras" in data and isinstance(data["extras"], dict):
183                document.extras.update(data["extras"])
184            if self.extra_keys is not None:
185                for key in self.extra_keys:
186                    if key not in data:
187                        continue
188                    if key == "extras" and isinstance(data[key], dict):
189                        document.extras.update(data[key])
190                    else:
191                        document.extras[key] = data[key]
192        except Exception as e:
193            logger.error(f"Failed to parsing in JSONLoader. Input document: \n{document.text}")
194            if self.ignore:
195                document.is_rejected = True
196                return document
197            else:
198                raise e
199
200        return document

テキストを Json として解釈し, key で指定した要素を文字列として doument に格納します.デフォルトの key は 'text' です.

Json の読み込み, あるいは key の読み込みに失敗した際には例外を送出します. これらを無視する場合は, ignore=True にします. その際, 読み込みに失敗 したドキュメントは破棄されます.

入力 Json に extras キー(辞書形式)が含まれている場合, Document.extras に自動的にマージされます。 さらに extra_keys でフィールドを指定すると, それらの値も Document.extras に追記され, 既存の extras を上書きせずに統合できます。

JSONLoader( key: str = 'text', ignore: bool = False, extra_keys: Optional[List[str]] = None, *args: Any, **kwargs: Any)
148    def __init__(
149        self,
150        key: str = "text",
151        ignore: bool = False,
152        extra_keys: Optional[List[str]] = None,
153        *args: Any,
154        **kwargs: Any,
155    ) -> None:
156        super().__init__(*args, **kwargs)
157        self.key = key
158        self.ignore = ignore
159        self.extra_keys = extra_keys

Initialize the filter.

Parameters

p : float The probability of applying the filter. If p is 1, the filter will always be applied. skip_rejected : bool If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False. random_state : Optional[Union[int, np.random.Generator]] Seed for the random number generator. If None, a new random number generator will be created. If None, and use in the Compose class, the random state is shared with the Compose object. use_batch : bool If True, the filter will process documents in batches in the apply_stream method. batch_size : int The size of the batch to process documents in the apply_stream method. kwargs : Any Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
161    def apply(self, document: Document) -> Document:
162        """
163        >>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
164        'hello, world'
165
166        >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
167        Traceback (most recent call last):
168            ...
169        json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
170
171        >>> JSONLoader()( '{"words": 2}' )
172        Traceback (most recent call last):
173            ...
174        KeyError: 'text'
175
176        >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
177        True
178        """
179        try:
180            data = json.loads(document.text)
181            document.text = str(data[self.key])
182            if "extras" in data and isinstance(data["extras"], dict):
183                document.extras.update(data["extras"])
184            if self.extra_keys is not None:
185                for key in self.extra_keys:
186                    if key not in data:
187                        continue
188                    if key == "extras" and isinstance(data[key], dict):
189                        document.extras.update(data[key])
190                    else:
191                        document.extras[key] = data[key]
192        except Exception as e:
193            logger.error(f"Failed to parse in JSONLoader. Input document: \n{document.text}")
194            if self.ignore:
195                document.is_rejected = True
196                return document
197            else:
198                raise e
199
200        return document
>>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
'hello, world'
>>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
Traceback (most recent call last):
    ...
json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
>>> JSONLoader()( '{"words": 2}' )
Traceback (most recent call last):
    ...
KeyError: 'text'
>>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
True
class JSONDumper(hojichar.core.filter_interface.Filter):
203class JSONDumper(Filter):
204    """
205    Document.text の文字列を json に変換します.
206    必要に応じ Document のメタデータを付与します. これはドキュメントの破棄事由が含まれ、偽陽性の分析に有効です。
207    デフォルトで `skip_rejected` が `False` にセットされており、Document の破棄フラグにかかわらず
208    処理されます。
209    """
210
211    def __init__(
212        self,
213        dump_reason: bool = False,
214        p: float = 1,
215        skip_rejected: bool = False,
216        export_extras: bool = False,
217        *args: Any,
218        **kwargs: Any,
219    ) -> None:
220        """
221        Args:
222            dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False.
223            p (float, optional): Apply probability. Defaults to 1.
224            skip_rejected (bool, optional): 破棄済みサンプルを排除しません.
225        """
226        super().__init__(p, skip_rejected, *args, **kwargs)
227        self.dump_reason = dump_reason
228        self.export_extras = export_extras
229
230    def apply(self, document: Document) -> Document:
231        """
232        >>> JSONDumper()("hojichar")
233        '{"text": "hojichar"}'
234        """
235        text = document.text
236        if self.dump_reason:
237            if self.export_extras:
238                output_extras = dict(document.extras)
239                document.text = json.dumps(
240                    {
241                        "text": text,
242                        "is_rejected": document.is_rejected,
243                        "reason": document.reject_reason,
244                        "extras": output_extras,
245                    },
246                    ensure_ascii=False,
247                )
248            else:
249                document.text = json.dumps(
250                    {
251                        "text": text,
252                        "is_rejected": document.is_rejected,
253                        "reason": document.reject_reason,
254                    },
255                    ensure_ascii=False,
256                )
257        else:
258            if self.export_extras:
259                output_extras = dict(document.extras)
260                document.text = json.dumps(
261                    {
262                        "text": text,
263                        "extras": output_extras,
264                    },
265                    ensure_ascii=False,
266                )
267            else:
268                document.text = json.dumps({"text": text}, ensure_ascii=False)
269        return document

Converts the Document.text string to JSON. Document metadata can optionally be attached; it includes the reason a document was rejected, which is useful for analyzing false positives. skip_rejected is set to False by default, so documents are processed regardless of their rejection flag.
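
A minimal sketch of dumping with rejection metadata (the exact reason value depends on how the document was rejected upstream):

from hojichar.core.models import Document
from hojichar.filters.document_filters import JSONDumper

dumper = JSONDumper(dump_reason=True)
doc = dumper.apply(Document("ほうじ茶", is_rejected=True))
# doc.text is now a JSON string with "text", "is_rejected" and "reason" entries,
# e.g. it contains "is_rejected": true for the rejected document above.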

JSONDumper( dump_reason: bool = False, p: float = 1, skip_rejected: bool = False, export_extras: bool = False, *args: Any, **kwargs: Any)
211    def __init__(
212        self,
213        dump_reason: bool = False,
214        p: float = 1,
215        skip_rejected: bool = False,
216        export_extras: bool = False,
217        *args: Any,
218        **kwargs: Any,
219    ) -> None:
220        """
221        Args:
222            dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False.
223            p (float, optional): Apply probability. Defaults to 1.
224            skip_rejected (bool, optional): 破棄済みサンプルを排除しません.
225        """
226        super().__init__(p, skip_rejected, *args, **kwargs)
227        self.dump_reason = dump_reason
228        self.export_extras = export_extras

Args:
    dump_reason (bool, optional): Dumps the is_rejected and reason entries. Defaults to False.
    p (float, optional): Apply probability. Defaults to 1.
    skip_rejected (bool, optional): Does not exclude already-rejected samples.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
230    def apply(self, document: Document) -> Document:
231        """
232        >>> JSONDumper()("hojichar")
233        '{"text": "hojichar"}'
234        """
235        text = document.text
236        if self.dump_reason:
237            if self.export_extras:
238                output_extras = dict(document.extras)
239                document.text = json.dumps(
240                    {
241                        "text": text,
242                        "is_rejected": document.is_rejected,
243                        "reason": document.reject_reason,
244                        "extras": output_extras,
245                    },
246                    ensure_ascii=False,
247                )
248            else:
249                document.text = json.dumps(
250                    {
251                        "text": text,
252                        "is_rejected": document.is_rejected,
253                        "reason": document.reject_reason,
254                    },
255                    ensure_ascii=False,
256                )
257        else:
258            if self.export_extras:
259                output_extras = dict(document.extras)
260                document.text = json.dumps(
261                    {
262                        "text": text,
263                        "extras": output_extras,
264                    },
265                    ensure_ascii=False,
266                )
267            else:
268                document.text = json.dumps({"text": text}, ensure_ascii=False)
269        return document
>>> JSONDumper()("hojichar")
'{"text": "hojichar"}'
class DocumentLengthFilter(hojichar.core.filter_interface.Filter):
272class DocumentLengthFilter(Filter):
273    """
274    `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します.
275    デフォルトでは 200字 以上 50000字以内のテキストが受理されます.
276    """
277
278    def __init__(
279        self,
280        min_doc_len: Optional[int] = None,
281        max_doc_len: Optional[int] = None,
282        *args: Any,
283        **kwargs: Any,
284    ) -> None:
285        super().__init__(*args, **kwargs)
286
287        self.min_doc_len = min_doc_len
288        self.max_doc_len = max_doc_len
289
290    def apply(self, doc: Document) -> Document:
291        """
292        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
293        True
294        """
295        doc_len = len(doc.text)
296        if self.min_doc_len is not None:
297            if doc_len < self.min_doc_len:
298                doc.is_rejected = True
299        if self.max_doc_len is not None:
300            if self.max_doc_len < doc_len:
301                doc.is_rejected = True
302        return doc

Rejects documents whose length falls outside the bounds given by min_doc_len and max_doc_len. By default, texts of at least 200 and at most 50,000 characters are accepted.
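
For example, to make the 200-50,000 character range explicit (a sketch; the bounds simply mirror the values mentioned above):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DocumentLengthFilter

length_filter = DocumentLengthFilter(min_doc_len=200, max_doc_len=50000)
length_filter.apply(Document("短い")).is_rejected  # True: shorter than 200 characters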

DocumentLengthFilter( min_doc_len: Optional[int] = None, max_doc_len: Optional[int] = None, *args: Any, **kwargs: Any)
278    def __init__(
279        self,
280        min_doc_len: Optional[int] = None,
281        max_doc_len: Optional[int] = None,
282        *args: Any,
283        **kwargs: Any,
284    ) -> None:
285        super().__init__(*args, **kwargs)
286
287        self.min_doc_len = min_doc_len
288        self.max_doc_len = max_doc_len

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
290    def apply(self, doc: Document) -> Document:
291        """
292        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
293        True
294        """
295        doc_len = len(doc.text)
296        if self.min_doc_len is not None:
297            if doc_len < self.min_doc_len:
298                doc.is_rejected = True
299        if self.max_doc_len is not None:
300            if self.max_doc_len < doc_len:
301                doc.is_rejected = True
302        return doc
>>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
True
class NgWordsFilterJa(hojichar.core.filter_interface.Filter):
305class NgWordsFilterJa(Filter):
306    """
307    日本語のNGワード(および不適切語)を含む文書を破棄します.
308    `dict_path` で指定したファイルから, キーワードのリストを得ます.
309    ファイルは単語が改行で羅列されたテキストファイルです.
310
311    `ignore_confused` を `True` にすると,
312    偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます.
313    デフォルト値は `False` です.
314    """
315
316    def __init__(
317        self,
318        dict_path: Union[str, PathLike],
319        ignore_confused: bool = False,
320        *args: Any,
321        **kwargs: Any,
322    ) -> None:
323        super().__init__(*args, **kwargs)
324
325        with open(dict_path, encoding="utf-8") as fp:
326            ng_words = fp.read().split("\n")
327        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
328
329        if ignore_confused:
330            words_katakana = []
331            words_not_katakana = []
332            for w in ng_words:
333                if re.fullmatch(r"[ァ-ヴー]+", w):
334                    words_katakana.append(re.escape(w))
335                else:
336                    words_not_katakana.append(re.escape(w))
337            katakana_pat = "|".join(words_katakana)
338            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
339            pat = "|".join(words_not_katakana) + "|" + katakana_pat
340            self.keyword_pat = re.compile(pat)
341        else:
342            ng_words = [re.escape(w) for w in ng_words]
343            pat = "|".join(ng_words)
344            self.keyword_pat = re.compile(pat)
345
346    def apply(self, doc: Document) -> Document:
347        regex_match = self.keyword_pat.search(doc.text)
348        if regex_match:
349            doc.is_rejected = True
350            self.matched_text = regex_match.group()
351            self.matched_text_neighbor = doc.text[
352                regex_match.start() - 20 : regex_match.end() + 20
353            ]
354
355        return doc

Rejects documents that contain Japanese NG words (and other inappropriate words). The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line.

When ignore_confused is True, katakana NG words are flagged only when they are not adjacent to other katakana characters, which reduces false positives. The default is False.
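
A sketch of typical usage; ng_words_ja.txt is a hypothetical newline-separated keyword file, not one bundled with the library:

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterJa

ng_filter = NgWordsFilterJa(dict_path="ng_words_ja.txt", ignore_confused=True)
doc = ng_filter.apply(Document("チェックしたい文章"))
if doc.is_rejected:
    # The filter records the matched keyword and its surrounding context.
    print(ng_filter.matched_text, ng_filter.matched_text_neighbor)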

NgWordsFilterJa( dict_path: Union[str, os.PathLike], ignore_confused: bool = False, *args: Any, **kwargs: Any)
316    def __init__(
317        self,
318        dict_path: Union[str, PathLike],
319        ignore_confused: bool = False,
320        *args: Any,
321        **kwargs: Any,
322    ) -> None:
323        super().__init__(*args, **kwargs)
324
325        with open(dict_path, encoding="utf-8") as fp:
326            ng_words = fp.read().split("\n")
327        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
328
329        if ignore_confused:
330            words_katakana = []
331            words_not_katakana = []
332            for w in ng_words:
333                if re.fullmatch(r"[ァ-ヴー]+", w):
334                    words_katakana.append(re.escape(w))
335                else:
336                    words_not_katakana.append(re.escape(w))
337            katakana_pat = "|".join(words_katakana)
338            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
339            pat = "|".join(words_not_katakana) + "|" + katakana_pat
340            self.keyword_pat = re.compile(pat)
341        else:
342            ng_words = [re.escape(w) for w in ng_words]
343            pat = "|".join(ng_words)
344            self.keyword_pat = re.compile(pat)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
346    def apply(self, doc: Document) -> Document:
347        regex_match = self.keyword_pat.search(doc.text)
348        if regex_match:
349            doc.is_rejected = True
350            self.matched_text = regex_match.group()
351            self.matched_text_neighbor = doc.text[
352                regex_match.start() - 20 : regex_match.end() + 20
353            ]
354
355        return doc

Definition of filter behavior.

The document must have a protocol TextContent; in most cases the hojichar.Document class is used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

class NgWordsFilterEn(hojichar.core.filter_interface.Filter):
358class NgWordsFilterEn(Filter):
359    """
360    英語のNGワード(および不適切語)を含む文書を破棄します.
361    `dict_path` で指定したファイルから, キーワードのリストを得ます.
362    ファイルは単語が改行で羅列されたテキストファイルです.
363    """
364
365    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
366        super().__init__(*args, **kwargs)
367
368        with open(dict_path, encoding="utf-8") as fp:
369            ng_words = fp.read().split("\n")
370        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
371        pat = "|".join(ng_words)
372        # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ.
373        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)
374
375    def apply(self, doc: Document) -> Document:
376        if self.keyword_pat.search(doc.text):
377            doc.is_rejected = True
378        return doc

Rejects documents that contain English NG words (and other inappropriate words). The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line.
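
Usage mirrors NgWordsFilterJa; ng_words_en.txt below is a hypothetical keyword file:

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterEn

en_filter = NgWordsFilterEn(dict_path="ng_words_en.txt")
en_filter.apply(Document("some text to check")).is_rejected
# A keyword counts only when preceded by a space or the start of the text and
# followed by a space, comma, period, or the end of the text.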

NgWordsFilterEn(dict_path: Union[str, os.PathLike], *args: Any, **kwargs: Any)
365    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
366        super().__init__(*args, **kwargs)
367
368        with open(dict_path, encoding="utf-8") as fp:
369            ng_words = fp.read().split("\n")
370        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
371        pat = "|".join(ng_words)
372        # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ.
373        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
375    def apply(self, doc: Document) -> Document:
376        if self.keyword_pat.search(doc.text):
377            doc.is_rejected = True
378        return doc

Definition of filter behavior.

The document must have a protocol TextContent; in most cases the hojichar.Document class is used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

class DiscardAdultContentJa(NgWordsFilterJa):
381class DiscardAdultContentJa(NgWordsFilterJa):
382    """
383    日本語のアダルトキーワード(および不適切語)を含む文書を破棄します.
384    `dict_path` で指定したファイルから, キーワードのリストを得ます.
385    ファイルは単語が改行で羅列されたテキストファイルです.
386    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です.
387    """
388
389    def __init__(
390        self,
391        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
392        *args: Any,
393        **kwargs: Any,
394    ) -> None:
395        super().__init__(dict_path, *args, **kwargs)
396
397    def apply(self, doc: Document) -> Document:
398        """
399        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
400        True
401
402        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
403        False
404
405        挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック,
406        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
407        # Matching with NG keyword "アス"
408        True
409        """
410        return super().apply(doc)

Rejects documents that contain Japanese adult keywords (and other inappropriate words). The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line. The default dict_path is /hojichar/dict/adult_keywords_ja.txt.

DiscardAdultContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/adult_keywords_ja.txt'), *args: Any, **kwargs: Any)
389    def __init__(
390        self,
391        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
392        *args: Any,
393        **kwargs: Any,
394    ) -> None:
395        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
397    def apply(self, doc: Document) -> Document:
398        """
399        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
400        True
401
402        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
403        False
404
405        挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック,
406        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
407        # Matching with NG keyword "アス"
408        True
409        """
410        return super().apply(doc)
>>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
False

A case where the behavior is correct but the match is a false positive; another example is サック matching inside リュックサック.

>>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected  # Matching with NG keyword "アス"
True
class DiscardAdultContentEn(NgWordsFilterEn):
413class DiscardAdultContentEn(NgWordsFilterEn):
414    """
415    英語のアダルトキーワード(および不適切語)を含む文書を破棄します.
416    `dict_path` で指定したファイルから, キーワードのリストを得ます.
417    ファイルは単語が改行で羅列されたテキストファイルです.
418    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です.
419    """
420
421    def __init__(
422        self,
423        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
424        *args: Any,
425        **kwargs: Any,
426    ) -> None:
427        super().__init__(dict_path, *args, **kwargs)
428
429    def apply(self, doc: Document) -> Document:
430        """
431        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
432        True
433
434        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
435        False
436        """
437        return super().apply(doc)

Rejects documents that contain English adult keywords (and other inappropriate words). The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line. The default dict_path is /hojichar/dict/adult_keywords_en.txt.

DiscardAdultContentEn( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/adult_keywords_en.txt'), *args: Any, **kwargs: Any)
421    def __init__(
422        self,
423        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
424        *args: Any,
425        **kwargs: Any,
426    ) -> None:
427        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
429    def apply(self, doc: Document) -> Document:
430        """
431        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
432        True
433
434        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
435        False
436        """
437        return super().apply(doc)
>>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
False
class DiscardDiscriminationContentJa(NgWordsFilterJa):
440class DiscardDiscriminationContentJa(NgWordsFilterJa):
441    """
442    日本語の差別キーワード(および不適切語)を含む文書を破棄します.
443    `dict_path` で指定したファイルから, キーワードのリストを得ます.
444    ファイルは単語が改行で羅列されたテキストファイルです.
445    デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です.
446    """
447
448    def __init__(
449        self,
450        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
451        *args: Any,
452        **kwargs: Any,
453    ):
454        super().__init__(dict_path, *args, **kwargs)
455
456    def apply(self, doc: Document) -> Document:
457        """
458        >>> DiscardDiscriminationContentJa().\
459            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
460        True
461
462        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
463        False
464        """
465        return super().apply(doc)

Rejects documents that contain Japanese discriminatory keywords (and other inappropriate words). The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line. The default dict_path is /hojichar/dict/discrimination_keywords_ja.txt.

DiscardDiscriminationContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/discrimination_keywords_ja.txt'), *args: Any, **kwargs: Any)
448    def __init__(
449        self,
450        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
451        *args: Any,
452        **kwargs: Any,
453    ):
454        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
456    def apply(self, doc: Document) -> Document:
457        """
458        >>> DiscardDiscriminationContentJa().\
459            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
460        True
461
462        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
463        False
464        """
465        return super().apply(doc)
>>> DiscardDiscriminationContentJa().apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
True
>>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
False
class DiscardViolenceContentJa(NgWordsFilterJa):
468class DiscardViolenceContentJa(NgWordsFilterJa):
469    """
470    日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します.
471    `dict_path` で指定したファイルから, キーワードのリストを得ます.
472    ファイルは単語が改行で羅列されたテキストファイルです.
473    デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です.
474    """
475
476    def __init__(
477        self,
478        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
479        *args: Any,
480        **kwargs: Any,
481    ) -> None:
482        super().__init__(dict_path, *args, **kwargs)
483
484    def apply(self, doc: Document) -> Document:
485        """
486        >>> DiscardViolenceContentJa()\
487            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
488        True
489
490        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
491        False
492        """
493        return super().apply(doc)

Rejects documents that contain Japanese keywords suggesting violence or threats. The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line. The default dict_path is /hojichar/dict/violence_keywords_ja.txt.
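
Because the bundled dictionaries are used by default, the Japanese content filters can be combined without arguments; a minimal pipeline sketch using Compose:

from hojichar import Compose
from hojichar.filters.document_filters import (
    DiscardAdultContentJa,
    DiscardDiscriminationContentJa,
    DiscardViolenceContentJa,
)

cleaner = Compose([
    DiscardAdultContentJa(),
    DiscardDiscriminationContentJa(),
    DiscardViolenceContentJa(),
])
cleaner("ほうじ茶")  # harmless text passes through unchanged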

DiscardViolenceContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/violence_keywords_ja.txt'), *args: Any, **kwargs: Any)
476    def __init__(
477        self,
478        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
479        *args: Any,
480        **kwargs: Any,
481    ) -> None:
482        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
484    def apply(self, doc: Document) -> Document:
485        """
486        >>> DiscardViolenceContentJa()\
487            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
488        True
489
490        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
491        False
492        """
493        return super().apply(doc)
>>> DiscardViolenceContentJa().apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
True
>>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
False
class DiscardBBSComments(hojichar.core.filter_interface.Filter):
496class DiscardBBSComments(Filter):
497    """
498    正規表現 "BBS Pattern" に `max_allow_num` 回よりたくさんマッチする文書を破棄します.
499    `max_allow_num` のデフォルト値は14です.
500    正規表現 "BBS Pattern" は下記のリンクで検証可能です.
501    https://regex101.com/r/ybQvL2/1
502    """
503
504    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
505        super().__init__(*args, **kwargs)
506
507        self.max_allowed_num = max_allowed_num
508        self.keyword_pat = re.compile(
509            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
510        )
511
512    def apply(self, doc: Document) -> Document:
513        """
514        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
515        True
516
517        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
518        False
519        """
520        bbs_factor = self.keyword_pat.findall(doc.text)
521        if len(bbs_factor) > self.max_allowed_num:
522            doc.is_rejected = True
523        return doc

正規表現 "BBS Pattern" に max_allow_num 回よりたくさんマッチする文書を破棄します. max_allow_num のデフォルト値は14です. 正規表現 "BBS Pattern" は下記のリンクで検証可能です. https://regex101.com/r/ybQvL2/1

DiscardBBSComments(max_allowed_num: int = 14, *args: Any, **kwargs: Any)
504    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
505        super().__init__(*args, **kwargs)
506
507        self.max_allowed_num = max_allowed_num
508        self.keyword_pat = re.compile(
509            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
510        )

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
512    def apply(self, doc: Document) -> Document:
513        """
514        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
515        True
516
517        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
518        False
519        """
520        bbs_factor = self.keyword_pat.findall(doc.text)
521        if len(bbs_factor) > self.max_allowed_num:
522            doc.is_rejected = True
523        return doc
>>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
True
>>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
False
class DiscardAds(hojichar.core.filter_interface.Filter):
526class DiscardAds(Filter):
527    """
528    主に広告キーワードを`max_allow_num`より多く含む文書を破棄します.
529    デフォルトで`max_allow_num` は14です.
530    `dict_path` で指定したファイルから, 広告キーワードのリストを得ます.
531    ファイルは単語が改行で羅列されたテキストファイルです.
532    デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です.
533    """
534
535    def __init__(
536        self,
537        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
538        max_allowed_num: int = 14,
539        *args: Any,
540        **kwargs: Any,
541    ):
542        super().__init__(*args, **kwargs)
543
544        self.max_allow_num = max_allowed_num
545        with open(dict_path, encoding="utf-8") as fp:
546            ng_words = fp.read().split("\n")
547        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
548        pat = r"|".join(ng_words)
549        self.keyword_pat = re.compile(pat)
550
551    def apply(self, doc: Document) -> Document:
552        """
553        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
554        True
555
556        >>> DiscardAds().apply(Document("おはよう")).is_rejected
557        False
558        """
559        ads_factor = self.keyword_pat.findall(doc.text)
560        if len(ads_factor) > self.max_allow_num:
561            doc.is_rejected = True
562        return doc

Rejects documents that contain more than max_allowed_num advertisement keywords; the default max_allowed_num is 14. The keyword list is read from the file specified by dict_path; the file is a plain-text file with one word per line. The default dict_path is /hojichar/dict/advertisement_keywords_ja.txt.
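
A sketch with a custom threshold (whether a given text is rejected depends on how many dictionary keywords it actually contains):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardAds

ads_filter = DiscardAds(max_allowed_num=5)
doc = ads_filter.apply(Document("お問い合わせください 営業時間 よくある質問" * 3))
doc.is_rejected  # True once more than 5 advertisement keywords are found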

DiscardAds( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/advertisement_keywords_ja.txt'), max_allowed_num: int = 14, *args: Any, **kwargs: Any)
535    def __init__(
536        self,
537        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
538        max_allowed_num: int = 14,
539        *args: Any,
540        **kwargs: Any,
541    ):
542        super().__init__(*args, **kwargs)
543
544        self.max_allow_num = max_allowed_num
545        with open(dict_path, encoding="utf-8") as fp:
546            ng_words = fp.read().split("\n")
547        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
548        pat = r"|".join(ng_words)
549        self.keyword_pat = re.compile(pat)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
551    def apply(self, doc: Document) -> Document:
552        """
553        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
554        True
555
556        >>> DiscardAds().apply(Document("おはよう")).is_rejected
557        False
558        """
559        ads_factor = self.keyword_pat.findall(doc.text)
560        if len(ads_factor) > self.max_allow_num:
561            doc.is_rejected = True
562        return doc
>>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
True
>>> DiscardAds().apply(Document("おはよう")).is_rejected
False
class AcceptJapanese(hojichar.core.filter_interface.Filter):
565class AcceptJapanese(Filter):
566    """
567    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます.
568        1. テキストを左から`lookup_size` (デフォルトで50字) 参照し,
569        ひらがな・カタカナが存在すれば日本語と判定する.
570    """
571
572    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
573        super().__init__(*args, **kwargs)
574
575        self.lookup_size = lookup_size
576        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")
577
578    def apply(self, doc: Document) -> Document:
579        """
580        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
581        True
582
583        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
584        True
585
586        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
587        False
588        """
589        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
590            doc.is_rejected = True
591        return doc

Rejects documents that are not Japanese. The language decision works as follows: 1. Inspect the first lookup_size characters of the text (50 by default); if hiragana or katakana characters are present, the document is judged to be Japanese.
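
Because only the first lookup_size characters are inspected, Japanese text preceded by a long non-Japanese preamble can be rejected; widening the window is one way around this (a sketch):

from hojichar.core.models import Document
from hojichar.filters.document_filters import AcceptJapanese

AcceptJapanese(lookup_size=50).apply(Document("a" * 50 + "あ")).is_rejected   # True: no kana in the first 50 chars
AcceptJapanese(lookup_size=100).apply(Document("a" * 50 + "あ")).is_rejected  # False: the kana now falls inside the window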

AcceptJapanese(lookup_size: int = 50, *args: Any, **kwargs: Any)
572    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
573        super().__init__(*args, **kwargs)
574
575        self.lookup_size = lookup_size
576        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
578    def apply(self, doc: Document) -> Document:
579        """
580        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
581        True
582
583        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
584        True
585
586        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
587        False
588        """
589        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
590            doc.is_rejected = True
591        return doc
>>> AcceptJapanese().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
True
>>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
False
class DiscardRareKuten(hojichar.core.filter_interface.Filter):
594class DiscardRareKuten(Filter):
595    """
596    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます
597    ドキュメントを句点"。"で区切り, 平均文長が
598    `max_avarage_sentence_length` より長い場合は破棄します.
599    `max_avarage_sentence_length` のデフォルト値は100です.
600    このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します.
601    """
602
603    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
604        super().__init__(*args, **kwargs)
605
606        self.max_average_sentence_length = max_average_sentence_length
607        self.kuten_pat = re.compile(r"。")
608
609    def apply(self, doc: Document) -> Document:
610        """
611        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
612        False
613        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
614        True
615        """
616        kuten_lst = self.kuten_pat.findall(doc.text)
617        min_kuten_num = len(doc.text) / self.max_average_sentence_length
618        if len(kuten_lst) < min_kuten_num:
619            doc.is_rejected = True
620        return doc

Rejects documents that are not (well-formed) Japanese. The document is split on the kuten character "。", and it is rejected when the average sentence length exceeds max_average_sentence_length (default: 100). In effect, this filter rejects documents in which kuten appear too rarely.
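
The acceptance condition amounts to requiring at least len(text) / max_average_sentence_length kuten; a short worked sketch:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardRareKuten

# 400 characters with the default threshold of 100 require at least 400 / 100 = 4 kuten.
doc = Document(("あ" * 99 + "。") * 4)  # four 100-character sentences -> exactly 4 kuten
DiscardRareKuten().apply(doc).is_rejected  # False: 4 kuten is not less than 400 / 100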

DiscardRareKuten(max_average_sentence_length: int = 100, *args: Any, **kwargs: Any)
603    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
604        super().__init__(*args, **kwargs)
605
606        self.max_average_sentence_length = max_average_sentence_length
607        self.kuten_pat = re.compile(r"。")

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
609    def apply(self, doc: Document) -> Document:
610        """
611        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
612        False
613        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
614        True
615        """
616        kuten_lst = self.kuten_pat.findall(doc.text)
617        min_kuten_num = len(doc.text) / self.max_average_sentence_length
618        if len(kuten_lst) < min_kuten_num:
619            doc.is_rejected = True
620        return doc
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
False
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
True
class HeaderFooterTagsRemover(hojichar.core.filter_interface.Filter):
623class HeaderFooterTagsRemover(Filter):
624    """
625    ドキュメントの冒頭・末尾のトークンを調査し, ヘッダー・フッダー的な
626    タグが存在していた場合, そのトークンを除去します.
627
628    このフィルタを通す前に, 事前にセンテンスレベルにトーカナイズしておいてください.
629    このフィルタでは Document.token にのみ変更が加えられるので, 出力前 あるいは 下流フィルタで
630    Document.text に変更を加える前にトークンをマージしておいてください.
631    """
632
633    def __init__(
634        self,
635        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
636        *args: Any,
637        **kwargs: Any,
638    ) -> None:
639        super().__init__(*args, **kwargs)
640
641        with open(dict_path) as fp:
642            keywords = fp.read().split("\n")
643        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
644        self.keyword_pat = re.compile(r"|".join(keywords))
645
646    def apply(self, doc: Document) -> Document:
647        if len(doc.tokens) == 0:
648            return doc
649
650        lookup_size = 0
651        if 1 <= len(doc.tokens) < 4:
652            lookup_size = 1
653        elif 4 <= len(doc.tokens) < 6:
654            lookup_size = 2
655        elif 6 <= len(doc.tokens):
656            lookup_size = 3
657
658        for i in range(lookup_size):
659            if self.should_drop_token(doc.tokens[i]):
660                doc.tokens[i].is_rejected = True
661            if self.should_drop_token(doc.tokens[-(i + 1)]):
662                doc.tokens[-(i + 1)].is_rejected = True
663
664        return doc
665
666    def should_drop_token(self, token: Token) -> bool:
667        """
668        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
669        True
670
671        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
672        False
673
674        Comment.
675        Original legacy code removed a pattern r"« _ | Main | _ »" .
676        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
677        It seems unintended behavior, so I fix this.
678        """
679        if self.keyword_pat.match(token.text):
680            return True
681        else:
682            return False

Inspects the tokens at the beginning and end of a document and removes those tokens when they look like header or footer tags.

Tokenize the document at the sentence level before passing it through this filter. This filter only modifies Document.tokens, so merge the tokens back before output, or before any downstream filter modifies Document.text.

HeaderFooterTagsRemover( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/header_footer_keywords_ja.txt'), *args: Any, **kwargs: Any)
633    def __init__(
634        self,
635        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
636        *args: Any,
637        **kwargs: Any,
638    ) -> None:
639        super().__init__(*args, **kwargs)
640
641        with open(dict_path) as fp:
642            keywords = fp.read().split("\n")
643        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
644        self.keyword_pat = re.compile(r"|".join(keywords))

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
646    def apply(self, doc: Document) -> Document:
647        if len(doc.tokens) == 0:
648            return doc
649
650        lookup_size = 0
651        if 1 <= len(doc.tokens) < 4:
652            lookup_size = 1
653        elif 4 <= len(doc.tokens) < 6:
654            lookup_size = 2
655        elif 6 <= len(doc.tokens):
656            lookup_size = 3
657
658        for i in range(lookup_size):
659            if self.should_drop_token(doc.tokens[i]):
660                doc.tokens[i].is_rejected = True
661            if self.should_drop_token(doc.tokens[-(i + 1)]):
662                doc.tokens[-(i + 1)].is_rejected = True
663
664        return doc

Definition of filter behavior.

The document must have a protocol TextContent; in most cases the hojichar.Document class is used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

def should_drop_token(self, token: hojichar.core.models.Token) -> bool:
666    def should_drop_token(self, token: Token) -> bool:
667        """
668        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
669        True
670
671        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
672        False
673
674        Comment.
675        Original legacy code removed a pattern r"« _ | Main | _ »" .
676        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
677        It seems unintended behavior, so I fix this.
678        """
679        if self.keyword_pat.match(token.text):
680            return True
681        else:
682            return False
>>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
True
>>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
False

Comment: the original legacy code removed the pattern r"« _ | Main | _ »". In that pattern, "|" is not escaped, so ANY string was eliminated. This appears to have been unintended behavior, so it has been fixed here.

class MaskPersonalInformation(hojichar.core.filter_interface.Filter):
685class MaskPersonalInformation(Filter):
686    """
687    ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします.
688    """
689
690    def __init__(self, *args: Any, **kwargs: Any) -> None:
691        super().__init__(*args, **kwargs)
692
693        self.phone_pat = re.compile(
694            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
695        )
696        self.email_pat = re.compile(
697            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
698        )
699
700    def apply(self, doc: Document) -> Document:
701        """
702        >>> MaskPersonalInformation()('06-1234-5678')
703        '06-1234-XXXX'
704        >>> MaskPersonalInformation()('075-123-4567')
705        '075-123-XXXX'
706        >>> MaskPersonalInformation()('0166-12-3456')
707        '0166-12-XXXX'
708        >>> MaskPersonalInformation()('09808-1-2345')
709        '09808-1-XXXX'
710        >>> MaskPersonalInformation()('090-1234-5678')
711        '090-1234-XXXX'
712        >>> MaskPersonalInformation()('0751234567')
713        '075123XXXX'
714        >>> MaskPersonalInformation()('08012345678')
715        '0801234XXXX'
716        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
717        '連絡は075-123-XXXX まで'
718        >>> MaskPersonalInformation()('+81-80-1234-5678')
719        '+81-80-1234-XXXX'
720        >>> MaskPersonalInformation()('+818012345678')
721        '+81801234XXXX'
722        >>> MaskPersonalInformation()('hogehoge@example.com')
723        'xxxx@yyy.com'
724        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
725        '何かあれば xxxx@yyy.jp まで連絡'
726        """
727        text = self.phone_pat.sub(r"\1XXXX", doc.text)
728        text = self.email_pat.sub(r"xxxx@yyy\1", text)
729        doc.text = text
730        return doc

Partially masks phone numbers and e-mail addresses contained in a document.

MaskPersonalInformation(*args: Any, **kwargs: Any)
690    def __init__(self, *args: Any, **kwargs: Any) -> None:
691        super().__init__(*args, **kwargs)
692
693        self.phone_pat = re.compile(
694            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
695        )
696        self.email_pat = re.compile(
697            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
698        )

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
700    def apply(self, doc: Document) -> Document:
701        """
702        >>> MaskPersonalInformation()('06-1234-5678')
703        '06-1234-XXXX'
704        >>> MaskPersonalInformation()('075-123-4567')
705        '075-123-XXXX'
706        >>> MaskPersonalInformation()('0166-12-3456')
707        '0166-12-XXXX'
708        >>> MaskPersonalInformation()('09808-1-2345')
709        '09808-1-XXXX'
710        >>> MaskPersonalInformation()('090-1234-5678')
711        '090-1234-XXXX'
712        >>> MaskPersonalInformation()('0751234567')
713        '075123XXXX'
714        >>> MaskPersonalInformation()('08012345678')
715        '0801234XXXX'
716        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
717        '連絡は075-123-XXXX まで'
718        >>> MaskPersonalInformation()('+81-80-1234-5678')
719        '+81-80-1234-XXXX'
720        >>> MaskPersonalInformation()('+818012345678')
721        '+81801234XXXX'
722        >>> MaskPersonalInformation()('hogehoge@example.com')
723        'xxxx@yyy.com'
724        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
725        '何かあれば xxxx@yyy.jp まで連絡'
726        """
727        text = self.phone_pat.sub(r"\1XXXX", doc.text)
728        text = self.email_pat.sub(r"xxxx@yyy\1", text)
729        doc.text = text
730        return doc
>>> MaskPersonalInformation()('06-1234-5678')
'06-1234-XXXX'
>>> MaskPersonalInformation()('075-123-4567')
'075-123-XXXX'
>>> MaskPersonalInformation()('0166-12-3456')
'0166-12-XXXX'
>>> MaskPersonalInformation()('09808-1-2345')
'09808-1-XXXX'
>>> MaskPersonalInformation()('090-1234-5678')
'090-1234-XXXX'
>>> MaskPersonalInformation()('0751234567')
'075123XXXX'
>>> MaskPersonalInformation()('08012345678')
'0801234XXXX'
>>> MaskPersonalInformation()('連絡は075-123-4567 まで')
'連絡は075-123-XXXX まで'
>>> MaskPersonalInformation()('+81-80-1234-5678')
'+81-80-1234-XXXX'
>>> MaskPersonalInformation()('+818012345678')
'+81801234XXXX'
>>> MaskPersonalInformation()('hogehoge@example.com')
'xxxx@yyy.com'
>>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
'何かあれば xxxx@yyy.jp まで連絡'
class DiscardTooManyNouns(hojichar.core.filter_interface.Filter):
733class DiscardTooManyNouns(Filter):
734    """
735    [!CAUTION] This filter requires `fugashi` package. Please install it
736    by `pip install 'hojichar[all]'`.
737
738    A filter that removes document with too many nouns in Japanese i.e.,
739    documents such as advertisement, word salad, etc ...
740    """
741
742    def __init__(
743        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
744    ) -> None:
745        """
746        Args:
747            threshold: document whose noun ratio is higher than this value will be discarded
748            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
749            *args:
750            **kwargs:
751        """
752        super().__init__(*args, **kwargs)
753        assert is_loaded_extras, (
754            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
755        )
756
757        self.threshold = threshold
758        self.max_parse_chars = max_parse_chars
759        self.tagger = Tagger("-Owakati")
760        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
761            "MeCab dictionary must be unidic"
762        )
763
764    def _chunk_text(self, text: str) -> Iterable[str]:
765        """Slice text into chunks of `max_parse_chars` length."""
766        step = self.max_parse_chars
767        for i in range(0, len(text), step):
768            yield text[i : i + step]
769
770    def apply(self, doc: Document) -> Document:
771        """
772        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
773        False
774        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
775        True
776        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
777        False
778        """
779        # remove "補助記号" from part-of-speech statistics
780        # because they often decrease the noun ratio,
781        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
782        # however, we don't want such sentence
783
784        pos_count: Counter[str] = Counter()
785        for chunk in self._chunk_text(doc.text):
786            for word in self.tagger(chunk):
787                if word.feature.pos1 != "補助記号":
788                    pos_count[word.feature.pos1] += 1
789
790        try:
791            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
792        except ZeroDivisionError:
793            noun_ratio = 0.0
794        if noun_ratio >= self.threshold:
795            doc.is_rejected = True
796        return doc

[!CAUTION] This filter requires the `fugashi` package. Please install it with `pip install 'hojichar[all]'`.

A filter that removes documents with too many nouns in Japanese, i.e., documents such as advertisements, word salad, etc.

DiscardTooManyNouns(threshold: float = 0.8, max_parse_chars: int = 100000, *args: Any, **kwargs: Any)
Args:
    threshold: documents whose noun ratio is higher than this value will be discarded.
    max_parse_chars: maximum number of characters to parse per document; too large a value may cause a segmentation fault while parsing.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
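
A minimal usage sketch (requires the `hojichar[all]` extra so that fugashi and a unidic dictionary are available; the sample sentences and expected flags mirror the doctests above):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManyNouns

# Reject documents whose noun ratio (excluding 補助記号) reaches the threshold.
filt = DiscardTooManyNouns(threshold=0.80)
print(filt.apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected)  # True
print(filt.apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected)  # False
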
class CharRepetitionRatioFilter(hojichar.core.filter_interface.Filter):
799class CharRepetitionRatioFilter(Filter):
800    """
801    文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します.
802    名詞の連続からなるような広告テキストを取り除くのに有効です.
803
804    実装は, BigScience で採用されていた前処理を参考にしています.
805    元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453  # noqa: E501
806
807    「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが,
808    これは文書長の影響を軽減するためだとされています.
809
810    掲示板のテキストが引っかかりやすい傾向があります.
811    13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0
812    的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう
813    """
814
815    def __init__(
816        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
817    ) -> None:
818        """
819
820        Args:
821            threshold: document with character repetition ratio higher than this value will be discarded
822            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
823            *args:
824            **kwargs:
825        """  # noqa: E501
826
827        super().__init__(*args, **kwargs)
828        self.threshold = threshold
829        self.ngram_size = ngram_size
830
831    def apply(self, doc: Document) -> Document:
832        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
833        if ratio >= self.threshold:
834            doc.is_rejected = True
835        return doc
836
837    @staticmethod
838    def compute_character_repetition_ratio(
839        document: str, character_repetition_length: int
840    ) -> float:
841        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
842            character_ngrams: List[str] = [
843                document[i : i + n] for i in range(len(document) - n + 1)
844            ]
845            freq_character_ngrams_dict: Dict[str, int] = {}
846            for character_ngram in character_ngrams:
847                freq_character_ngrams_dict[character_ngram] = (
848                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
849                )
850            return freq_character_ngrams_dict
851
852        freq_character_ngrams_dict = get_freq_character_ngrams(
853            document, character_repetition_length
854        )
855        if len(freq_character_ngrams_dict) == 0:
856            return 0.0
857        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
858        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
859        val_one = len([el for el in freq_character_ngrams if el == 1])
860        num_rep_character_ngrams = min(
861            int(np.sqrt(len(freq_character_ngrams))),
862            len(freq_character_ngrams) - val_one,
863        )
864        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
865            freq_character_ngrams
866        )
867        return character_repetition_ratio

Computes the character n-gram overlap ratio (the share of the document taken up by high-frequency character n-grams) and removes documents where the overlap is large. This is effective for removing advertisement text that consists of long runs of nouns.

The implementation follows the preprocessing adopted by BigScience. Original implementation: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453

The "high-frequency character n-grams" are taken as the top sqrt(number of unique n-grams); this is said to reduce the influence of document length.

Bulletin-board text tends to get caught: lines like "13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0" appear at high frequency, so the character n-gram overlap ratio also ends up high.

CharRepetitionRatioFilter(threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any)
Args:
    threshold: documents with a character repetition ratio higher than this value will be discarded.
    ngram_size: character n-gram size; a larger value reduces false positives on long documents.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:


@staticmethod
def compute_character_repetition_ratio(document: str, character_repetition_length: int) -> float:
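
A small sketch of both entry points (the sample string is made up; a document with no repeated character 5-gram scores 0.0 and is kept):

from hojichar.core.models import Document
from hojichar.filters.document_filters import CharRepetitionRatioFilter

filt = CharRepetitionRatioFilter(threshold=0.33, ngram_size=5)

# No character 5-gram repeats in this sentence, so the score is 0.0 and the document passes.
print(CharRepetitionRatioFilter.compute_character_repetition_ratio("ここは繰り返しのない普通の文章です。", 5))  # 0.0
print(filt.apply(Document("ここは繰り返しのない普通の文章です。")).is_rejected)  # False

Whether a longer document crosses the default 0.33 threshold depends on how many 5-gram occurrences fall on the top sqrt(#unique 5-grams) n-grams, so heavily templated or copy-pasted pages score much higher than ordinary prose.
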
class WordRepetitionRatioFilter(hojichar.core.filter_interface.Filter):
870class WordRepetitionRatioFilter(Filter):
871    """
872    [!CAUTION] This filter requires `fugashi` package. Please install it
873    by `pip install 'hojichar[all]'`.
874
875    単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ.
876    BigScienceで採用されていた前処理を参考にしている.
877
878    名詞が連打されているような広告テキストを取り除くのに有効な様子
879    まともな文書がたまたま2回繰り返されている場合もあり、これを取り除いて良いのかは分からない
880    例:
881    "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57)
882    ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ
883    られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる
884    なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上
885    高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら
886    経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時
887    56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄
888    り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入
889    るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増
890    益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、
891    電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ
892    イの回復で収益が急回"
893    """  # noqa: E501
894
895    def __init__(
896        self,
897        threshold: float = 0.40,
898        ngram_size: int = 7,
899        max_parse_chars: int = 100_000,
900        *args: Any,
901        **kwargs: Any,
902    ) -> None:
903        """
904
905        Args:
906            threshold: document whose character repetition ratio is higher than this value will be discarded
907            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
908            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
909            *args:
910            **kwargs:
911        """  # noqa: E501
912        super().__init__(*args, **kwargs)
913        assert is_loaded_extras, (
914            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
915        )
916
917        self.threshold = threshold
918        self.ngram_size = ngram_size
919        self.max_parse_chars = max_parse_chars
920        self.tagger = Tagger("-Owakati")
921
922    def _chunk_text(self, text: str) -> Iterable[str]:
923        """Split text into chunks of `max_parse_chars` length."""
924        step = self.max_parse_chars
925        for i in range(0, len(text), step):
926            yield text[i : i + step]
927
928    def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]:
929        freq: Dict[str, int] = {}
930        if n <= 0 or len(words) < n:
931            return freq
932        for i in range(len(words) - n + 1):
933            key = " ".join(words[i : i + n])
934            freq[key] = freq.get(key, 0) + 1
935        return freq
936
937    def apply(self, doc: Document) -> Document:
938        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
939        if ratio >= self.threshold:
940            doc.is_rejected = True
941        return doc
942
943    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
944        total_counter: Counter[str] = Counter()
945
946        for chunk in self._chunk_text(document):
947            words = [w.surface for w in self.tagger(chunk)]
948            total_counter.update(self._get_freq_word_ngrams(words, n))
949
950        if not total_counter:
951            return 0.0
952
953        total = sum(total_counter.values())
954        repeated = sum(v for v in total_counter.values() if v > 1)
955        return repeated / total

[!CAUTION] This filter requires the `fugashi` package. Please install it with `pip install 'hojichar[all]'`.

A filter that computes the word n-gram overlap ratio (the share of the document taken up by duplicated word n-grams) and rejects documents where the overlap is large. Based on the preprocessing adopted by BigScience.

It appears to be effective for removing advertisement text in which nouns are strung together. A perfectly normal document may also just happen to be repeated twice, and it is unclear whether such documents should really be removed. Example:

"ウェブ
本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ) 2013/05/10(10:57) ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ) 2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ イの回復で収益が急回"

WordRepetitionRatioFilter(threshold: float = 0.4, ngram_size: int = 7, max_parse_chars: int = 100000, *args: Any, **kwargs: Any)

Args:
    threshold: documents whose word repetition ratio is higher than this value will be discarded.
    ngram_size: word n-gram size; a larger value reduces false positives on long documents.
    max_parse_chars: maximum number of characters to parse per document; too large a value may cause a segmentation fault while parsing.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:


def compute_word_repetition_ratio(self, document: str, n: int) -> float:
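
A minimal sketch (requires the `hojichar[all]` extra; the sentence is a made-up example, and the expected flags assume its unidic tokenization yields well over seven words, which holds for a sentence of this length):

from hojichar.core.models import Document
from hojichar.filters.document_filters import WordRepetitionRatioFilter

filt = WordRepetitionRatioFilter(threshold=0.40, ngram_size=7)

sentence = "外国為替市場で円が1ドル100円台に入るなど急速に円安が進み、輸出株が軒並み高になった。"
print(filt.compute_word_repetition_ratio(sentence, 7))  # 0.0: no word 7-gram repeats within a single copy
print(filt.apply(Document(sentence * 2)).is_rejected)   # True: pasting the sentence twice makes nearly every word 7-gram occur twice
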
class DiscardTooManySpecialToken(hojichar.core.filter_interface.Filter):
 958class DiscardTooManySpecialToken(Filter):
 959    """
 960    [!CAUTION] This filter requires `emoji` package. Please install it
 961    by `pip install 'hojichar[all]'`.
 962
 963    句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ
 964    元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16  # noqa: E501
 965    """
 966
 967    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
 968        """
 969
 970        Args:
 971            threshold: document whose special token ratio is higher than this value will be discarded
 972            *args:
 973            **kwargs:
 974        """  # noqa: E501
 975        super().__init__(*args, **kwargs)
 976
 977        # digits are not regarded as special tokens
 978        # otherwise many false positives are made, i.e., good documents discarded
 979        main_special_characters = string.punctuation + string.whitespace  # + string.digits
 980        other_special_characters = (
 981            "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–▬…✦�­£​•€«»°·═"
 982            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
 983            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
 984            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
 985            "」﴾》�"
 986        )
 987
 988        en_emoji = emoji.EMOJI_DATA.keys()
 989
 990        special_characters_default = set(main_special_characters + other_special_characters)
 991        special_characters_default.update(en_emoji)
 992        self.special_characters = special_characters_default
 993
 994        self.threshold = threshold
 995
 996    def _compute_special_characters_ratio(self, text: str) -> float:
 997        if len(text) == 0:
 998            return 0
 999
1000        special_characters_ratio = len(
1001            [char for char in text if char in self.special_characters]
1002        ) / len(text)
1003        return special_characters_ratio
1004
1005    def apply(self, doc: Document) -> Document:
1006        special_characters_ratio = self._compute_special_characters_ratio(doc.text)
1007
1008        if special_characters_ratio > self.threshold:
1009            doc.is_rejected = True
1010        return doc

[!CAUTION] This filter requires the `emoji` package. Please install it with `pip install 'hojichar[all]'`.

A filter for removing documents that contain more than a certain ratio of symbols (including punctuation), whitespace, emoji, and other special characters. Original implementation: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16

DiscardTooManySpecialToken(threshold: float = 0.4, *args: Any, **kwargs: Any)

Args:
    threshold: documents whose special-token ratio is higher than this value will be discarded.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:

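
A minimal sketch (requires the `hojichar[all]` extra for the `emoji` package; the strings are made-up examples):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManySpecialToken

filt = DiscardTooManySpecialToken(threshold=0.4)

# Ordinary prose: only the final 。 counts as a special character, so the ratio stays far below 0.4.
print(filt.apply(Document("今日は天気が良いので散歩に行きました。")).is_rejected)  # False

# A string made entirely of symbols and emoji has a ratio of 1.0 and is rejected.
print(filt.apply(Document("★★★!!!???……★★★🎉🎉🎉")).is_rejected)  # True
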

class SingleCharacterRepetitionFilter(hojichar.core.filter_interface.Filter):
1013class SingleCharacterRepetitionFilter(Filter):
1014    """
1015    単一文字が大量に繰り返されているような文書を取り除くためのフィルタ
1016    そのような文書はノイズである可能性が高いため
1017    参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい
1018    https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset  # noqa: E501
1019    """
1020
1021    def __init__(
1022        self,
1023        threshold: int = 200,
1024        *args: Any,
1025        **kwargs: Any,
1026    ) -> None:
1027        """
1028        Args:
1029            threshold: The document is removed if character is repeated for this value or more
1030            *args:
1031            **kwargs:
1032        """
1033        super().__init__(*args, **kwargs)
1034        self.threshold = threshold
1035
1036    def _is_repeat_contained(self, text: str) -> bool:
1037        groups = groupby(text)
1038        is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups)
1039        return is_repeat_contained
1040
1041    def apply(self, doc: Document) -> Document:
1042        if self._is_repeat_contained(doc.text):
1043            doc.is_rejected = True
1044        return doc

A filter for removing documents in which a single character is repeated a huge number of times, since such documents are very likely to be noise. Reference: according to the BigScience project, the OSCAR dataset contained documents consisting of nothing but 2M backslashes. https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset

SingleCharacterRepetitionFilter(threshold: int = 200, *args: Any, **kwargs: Any)

Args:
    threshold: the document is removed if any single character is repeated this many times or more.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:

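
A minimal sketch (the strings are made-up examples):

from hojichar.core.models import Document
from hojichar.filters.document_filters import SingleCharacterRepetitionFilter

filt = SingleCharacterRepetitionFilter(threshold=200)

# A run of 300 identical characters exceeds the threshold of 200 and triggers rejection.
print(filt.apply(Document("正常なテキスト" + "\\" * 300)).is_rejected)  # True

# Ordinary text never repeats one character 200 times in a row.
print(filt.apply(Document("これは普通の文章です。")).is_rejected)  # False
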

class DiscardTooManyEndingEllipsis(hojichar.core.filter_interface.Filter):
1047class DiscardTooManyEndingEllipsis(Filter):
1048    """
1049    ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです.
1050    ellipsisとしては ... と … を用いている
1051    同様のフィルタが RedPajama v2で用いられています.
1052
1053    例として, 以下のような文書を検知します.
1054    ```
1055    ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
1056    バツイチアラフォー 婚活ち女性の特徴と子持な付...
1057    ```
1058
1059    デフォルトではしきい値を0.7としているが, これはC4から0.1%を削るような設定であり、
1060    precisionを重視した設定です.
1061    """
1062
1063    def __init__(
1064        self,
1065        threshold: float = 0.7,
1066        *args: Any,
1067        **kwargs: Any,
1068    ) -> None:
1069        """
1070        Args:
1071            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
1072            *args:
1073            **kwargs:
1074        """  # noqa: E501
1075        super().__init__(*args, **kwargs)
1076        self.threshold = threshold
1077        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n
1078
1079    def apply(self, doc: Document) -> Document:
1080        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
1081        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
1082        ellipsis_ratio = ellipsis_count / newline_count
1083
1084        if ellipsis_ratio > self.threshold:
1085            doc.is_rejected = True
1086        return doc

A filter for removing documents that contain many lines ending with an ellipsis. Both ... and … are treated as ellipses. A similar filter is used in RedPajama v2.

As an example, it detects documents like the following:

ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
バツイチアラフォー 婚活ち女性の特徴と子持な付...

The default threshold of 0.7 is a setting that trims about 0.1% of C4, i.e. it favors precision.

DiscardTooManyEndingEllipsis(threshold: float = 0.7, *args: Any, **kwargs: Any)

Args:
    threshold: the document is removed if the ratio of lines ending with an ellipsis is higher than this value.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:

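
A minimal sketch (the strings are made-up examples; note that the pattern only counts an ellipsis immediately followed by a newline, and the ratio is taken over the number of newlines):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManyEndingEllipsis

filt = DiscardTooManyEndingEllipsis(threshold=0.7)

# Both newline-terminated lines end with "..." -> ratio 1.0 > 0.7, so the document is rejected.
truncated = "ペアーズは女性、という驚愕の過食が...\nバツイチアラフォー 婚活ち女性の特徴と...\n"
print(filt.apply(Document(truncated)).is_rejected)  # True

# Regular prose has no ellipsis-terminated lines.
print(filt.apply(Document("普通の文章です。\n次の行も普通です。\n")).is_rejected)  # False
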

class DiscardTooShortLines(hojichar.core.filter_interface.Filter):
1089class DiscardTooShortLines(Filter):
1090    """
1091    短い行を大量に含む文書を捨てるためのフィルタです.
1092
1093    メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です.
1094    """
1095
1096    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
1097        """
1098        Args:
1099            threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value.
1100            *args:
1101            **kwargs:
1102        """  # noqa: E501
1103        super().__init__(*args, **kwargs)
1104        self.threshold = threshold
1105        # この値は適当に決め打ち
1106        self.minimum_line_length = 10
1107
1108    def apply(self, doc: Document) -> Document:
1109        lines = [len(x) for x in doc.text.split("\n")]
1110        short_lines = [x for x in lines if x <= self.minimum_line_length]
1111        if (len(short_lines) / len(lines)) > self.threshold:
1112            doc.is_rejected = True
1113        return doc

A filter for discarding documents that contain a large number of short lines.

It is effective for removing documents that mostly consist of elements such as menu bars and breadcrumb lists.

DiscardTooShortLines(threshold: float = 0.5, *args: Any, **kwargs: Any)

Args:
    threshold: the document is removed if the ratio of short lines (10 characters or fewer) is higher than this value.

def apply(self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:

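
A minimal sketch (the strings are made-up examples; lines of 10 characters or fewer count as short):

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooShortLines

filt = DiscardTooShortLines(threshold=0.5)

# 4 of the 5 lines are 10 characters or shorter (menu/breadcrumb debris) -> rejected.
menu_like = "ホーム\n製品情報\nお問い合わせ\n会社概要\nこの製品は高性能なカメラで、夜間撮影にも対応しています。"
print(filt.apply(Document(menu_like)).is_rejected)  # True

# A document made of full sentences is kept.
prose = "この製品は高性能なカメラで、夜間撮影にも対応しています。\n自然言語処理のためのテキストフィルタリングについて説明します。"
print(filt.apply(Document(prose)).is_rejected)  # False
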