hojichar.filters.document_filters

   1import json
   2import logging
   3import pathlib
   4import re
   5import string
   6import time
   7import unicodedata
   8from collections import Counter
   9from itertools import groupby
  10from os import PathLike
  11from typing import Any, Dict, Iterable, List, Optional, Union
  12
  13import numpy as np
  14
  15import hojichar
  16from hojichar.core.filter_interface import Filter
  17from hojichar.core.models import Document, Token
  18
  19try:
  20    import emoji
  21    from fugashi import Tagger  # type: ignore
  22
  23    is_loaded_extras = True
  24except ImportError:
  25    is_loaded_extras = False
  26
  27BASE_PATH = pathlib.Path(hojichar.__path__[0])
  28logger = logging.getLogger(__name__)
  29
  30
  31class ExampleHojiChar(Filter):
  32    """An example of a basic filter implementation. Appends '<hojichar>' to the end of the text."""
  33
  34    def apply(self, document: Document) -> Document:
  35        """
  36        >>> ExampleHojiChar()("hello, world")
  37        'hello, world<hojichar>'
  38        """
  39        document.text += "<hojichar>"
  40        return document
  41
  42
  43class ExampleDiscardDocumentContainKeyword(Filter):
  44    """An example filter that discards documents containing a specific keyword."""
  45
  46    def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None:
  47        super().__init__(*args, **kwargs)
  48        self.keyword = keyword
  49
  50    def apply(self, document: Document) -> Document:
  51        """
  52        >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
  53        True
  54        """
  55        if self.keyword in document.text:
  56            document.is_rejected = True
  57        return document
  58
  59
  60class Identity(Filter):
  61    """A filter that makes no changes to the document. Used for testing and debugging."""
  62
  63    def apply(self, document: Document) -> Document:
  64        return document
  65
  66
  67class DiscardAll(Filter):
  68    """
  69    A filter that discards all documents.
  70    Used for testing and debugging.
  71    """
  72
  73    def apply(self, document: Document) -> Document:
  74        document.is_rejected = True
  75        return document
  76
  77
  78class ApplyDiscard(Filter):
  79    """
  80    Sets the text of a `Document` rejected by an upstream filter to an empty string.
  81
  82    Because documents with `Document.is_rejected=True` are ignored downstream,
  83    this filter has no effect when passed to the `Compose` constructor.
  84    It is mainly used inside `Compose`, or when debugging with
  85    `discard_filtered=False`.
  86    """
  87
  88    def __init__(self, *args: Any, **kwargs: Any) -> None:
  89        super().__init__(*args, **kwargs)
  90
  91    def apply(self, document: Document) -> Document:
  92        """
  93        >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
  94        ''
  95        """
  96        if document.is_rejected:
  97            document.text = ""
  98
  99        return document
 100
 101
 102class Sleep(Filter):
 103    """
 104    A filter for debugging. Sleeps for the specified number of seconds.
 105    """
 106
 107    def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None:
 108        super().__init__(*args, **kwargs)
 109        self.time = time
 110
 111    def apply(self, document: Document) -> Document:
 112        """
 113        >>> Sleep(0.1)('hello')  # After 0.1 seconds,
 114        'hello'
 115        """
 116        time.sleep(self.time)
 117        return document
 118
 119
 120class DocumentNormalizer(Filter):
 121    """
 122    Applies Unicode NFKC normalization to the document text.
 123    """
 124
 125    def __init__(self, *args: Any, **kwargs: Any) -> None:
 126        super().__init__(*args, **kwargs)
 127
 128    def apply(self, document: Document) -> Document:
 129        document.text = unicodedata.normalize("NFKC", document.text)
 130        return document
 131
 132
 133class JSONLoader(Filter):
 134    """
 135    Parses the text as JSON and stores the element specified by `key` in the
 136    document as a string. The default `key` is 'text'.
 137
 138    If parsing the JSON or reading `key` fails, an exception is raised.
 139    To ignore these errors, set `ignore=True`; documents that fail to load
 140    are then rejected.
 141    """
 142
 143    def __init__(
 144        self,
 145        key: str = "text",
 146        ignore: bool = False,
 147        extra_keys: Optional[List[str]] = None,
 148        *args: Any,
 149        **kwargs: Any,
 150    ) -> None:
 151        super().__init__(*args, **kwargs)
 152        self.key = key
 153        self.ignore = ignore
 154        self.extra_keys = extra_keys
 155
 156    def apply(self, document: Document) -> Document:
 157        """
 158        >>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
 159        'hello, world'
 160
 161        >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
 162        Traceback (most recent call last):
 163            ...
 164        json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
 165
 166        >>> JSONLoader()( '{"words": 2}' )
 167        Traceback (most recent call last):
 168            ...
 169        KeyError: 'text'
 170
 171        >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
 172        True
 173        """
 174        try:
 175            data = json.loads(document.text)
 176            document.text = str(data[self.key])
 177            if self.extra_keys is not None:
 178                document.extras = {key: data[key] for key in self.extra_keys if key in data}
 179        except Exception as e:
 180            logger.error(f"Failed to parse JSON in JSONLoader. Input document: \n{document.text}")
 181            if self.ignore:
 182                document.is_rejected = True
 183                return document
 184            else:
 185                raise e
 186
 187        return document
 188
 189
 190class JSONDumper(Filter):
 191    """
 192    Converts the Document.text string to JSON.
 193    Optionally attaches Document metadata, including the reason a document was
 194    rejected, which is useful for analyzing false positives.
 195    `skip_rejected` defaults to `False`, so documents are processed regardless of the rejection flag.
 196    """
 197
 198    def __init__(
 199        self,
 200        dump_reason: bool = False,
 201        p: float = 1,
 202        skip_rejected: bool = False,
 203        export_extras: bool = False,
 204        *args: Any,
 205        **kwargs: Any,
 206    ) -> None:
 207        """
 208        Args:
 209            dump_reason (bool, optional): Dumps the `is_rejected` and `reason` entries. Defaults to False.
 210            p (float, optional): Apply probability. Defaults to 1.
 211            skip_rejected (bool, optional): Rejected samples are not excluded from processing.
 212        """
 213        super().__init__(p, skip_rejected, *args, **kwargs)
 214        self.dump_reason = dump_reason
 215        self.export_extras = export_extras
 216
 217    def apply(self, document: Document) -> Document:
 218        """
 219        >>> JSONDumper()("hojichar")
 220        '{"text": "hojichar"}'
 221        """
 222        text = document.text
 223        if self.dump_reason:
 224            if self.export_extras:
 225                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
 226                document.text = json.dumps(
 227                    {
 228                        "text": text,
 229                        "is_rejected": document.is_rejected,
 230                        "reason": document.reject_reason,
 231                        "extras": output_extras,
 232                    },
 233                    ensure_ascii=False,
 234                )
 235            else:
 236                document.text = json.dumps(
 237                    {
 238                        "text": text,
 239                        "is_rejected": document.is_rejected,
 240                        "reason": document.reject_reason,
 241                    },
 242                    ensure_ascii=False,
 243                )
 244        else:
 245            if self.export_extras:
 246                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
 247                document.text = json.dumps(
 248                    {
 249                        "text": text,
 250                        "extras": output_extras,
 251                    },
 252                    ensure_ascii=False,
 253                )
 254            else:
 255                document.text = json.dumps({"text": text}, ensure_ascii=False)
 256        return document
 257
 258
 259class DocumentLengthFilter(Filter):
 260    """
 261    Discards documents whose length is outside the bounds given by `min_doc_len` and `max_doc_len`.
 262    Both bounds default to `None`, in which case no restriction is applied on that side.
 263    """
 264
 265    def __init__(
 266        self,
 267        min_doc_len: Optional[int] = None,
 268        max_doc_len: Optional[int] = None,
 269        *args: Any,
 270        **kwargs: Any,
 271    ) -> None:
 272        super().__init__(*args, **kwargs)
 273
 274        self.min_doc_len = min_doc_len
 275        self.max_doc_len = max_doc_len
 276
 277    def apply(self, doc: Document) -> Document:
 278        """
 279        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
 280        True
 281        """
 282        doc_len = len(doc.text)
 283        if self.min_doc_len is not None:
 284            if doc_len < self.min_doc_len:
 285                doc.is_rejected = True
 286        if self.max_doc_len is not None:
 287            if self.max_doc_len < doc_len:
 288                doc.is_rejected = True
 289        return doc
 290
 291
 292class NgWordsFilterJa(Filter):
 293    """
 294    Discards documents containing Japanese NG words (including inappropriate terms).
 295    The keyword list is read from the file specified by `dict_path`.
 296    The file is a plain-text file listing one word per line.
 297
 298    When `ignore_confused` is `True`, katakana NG words are only matched when they
 299    are not adjacent to other katakana characters, which reduces false positives.
 300    The default is `False`.
 301    """
 302
 303    def __init__(
 304        self,
 305        dict_path: Union[str, PathLike],
 306        ignore_confused: bool = False,
 307        *args: Any,
 308        **kwargs: Any,
 309    ) -> None:
 310        super().__init__(*args, **kwargs)
 311
 312        with open(dict_path, encoding="utf-8") as fp:
 313            ng_words = fp.read().split("\n")
 314        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
 315
 316        if ignore_confused:
 317            words_katakana = []
 318            words_not_katakana = []
 319            for w in ng_words:
 320                if re.fullmatch(r"[ァ-ヴー]+", w):
 321                    words_katakana.append(re.escape(w))
 322                else:
 323                    words_not_katakana.append(re.escape(w))
 324            katakana_pat = "|".join(words_katakana)
 325            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
 326            pat = "|".join(words_not_katakana) + "|" + katakana_pat
 327            self.keyword_pat = re.compile(pat)
 328        else:
 329            ng_words = [re.escape(w) for w in ng_words]
 330            pat = "|".join(ng_words)
 331            self.keyword_pat = re.compile(pat)
 332
 333    def apply(self, doc: Document) -> Document:
 334        regex_match = self.keyword_pat.search(doc.text)
 335        if regex_match:
 336            doc.is_rejected = True
 337            self.matched_text = regex_match.group()
 338            self.matched_text_neighbor = doc.text[
 339                regex_match.start() - 20 : regex_match.end() + 20
 340            ]
 341
 342        return doc
 343
 344
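A minimal usage sketch for NgWordsFilterJa (not from the module source; ./ng_words.txt is a hypothetical keyword file listing one word per line):

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import NgWordsFilterJa

    # ./ng_words.txt (hypothetical) lists NG words, one per line.
    ng_filter = NgWordsFilterJa("./ng_words.txt")
    doc = ng_filter.apply(Document("このドキュメントにNGワードが含まれていれば棄却されます"))
    print(doc.is_rejected)  # True only if the text contains a listed keyword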
 345class NgWordsFilterEn(Filter):
 346    """
 347    Discards documents containing English NG words (including inappropriate terms).
 348    The keyword list is read from the file specified by `dict_path`.
 349    The file is a plain-text file listing one word per line.
 350    """
 351
 352    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
 353        super().__init__(*args, **kwargs)
 354
 355        with open(dict_path, encoding="utf-8") as fp:
 356            ng_words = fp.read().split("\n")
 357        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
 358        pat = "|".join(ng_words)
 359        # Matches English word patterns such as " word ", "word.", "word," and so on.
 360        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)
 361
 362    def apply(self, doc: Document) -> Document:
 363        if self.keyword_pat.search(doc.text):
 364            doc.is_rejected = True
 365        return doc
 366
 367
 368class DiscardAdultContentJa(NgWordsFilterJa):
 369    """
 370    Discards documents containing Japanese adult keywords (including inappropriate terms).
 371    The keyword list is read from the file specified by `dict_path`.
 372    The file is a plain-text file listing one word per line.
 373    The default `dict_path` is /hojichar/dict/adult_keywords_ja.txt.
 374    """
 375
 376    def __init__(
 377        self,
 378        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
 379        *args: Any,
 380        **kwargs: Any,
 381    ) -> None:
 382        super().__init__(dict_path, *args, **kwargs)
 383
 384    def apply(self, doc: Document) -> Document:
 385        """
 386        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
 387        True
 388
 389        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
 390        False
 391
 392        A case where the behavior is correct but the match is a false positive (another example: サック in リュックサック).
 393        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
 394        # Matching with NG keyword "アス"
 395        True
 396        """
 397        return super().apply(doc)
 398
 399
 400class DiscardAdultContentEn(NgWordsFilterEn):
 401    """
 402    Discards documents containing English adult keywords (including inappropriate terms).
 403    The keyword list is read from the file specified by `dict_path`.
 404    The file is a plain-text file listing one word per line.
 405    The default `dict_path` is /hojichar/dict/adult_keywords_en.txt.
 406    """
 407
 408    def __init__(
 409        self,
 410        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
 411        *args: Any,
 412        **kwargs: Any,
 413    ) -> None:
 414        super().__init__(dict_path, *args, **kwargs)
 415
 416    def apply(self, doc: Document) -> Document:
 417        """
 418        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
 419        True
 420
 421        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
 422        False
 423        """
 424        return super().apply(doc)
 425
 426
 427class DiscardDiscriminationContentJa(NgWordsFilterJa):
 428    """
 429    Discards documents containing Japanese discriminatory keywords (including inappropriate terms).
 430    The keyword list is read from the file specified by `dict_path`.
 431    The file is a plain-text file listing one word per line.
 432    The default `dict_path` is /hojichar/dict/discrimination_keywords_ja.txt.
 433    """
 434
 435    def __init__(
 436        self,
 437        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
 438        *args: Any,
 439        **kwargs: Any,
 440    ):
 441        super().__init__(dict_path, *args, **kwargs)
 442
 443    def apply(self, doc: Document) -> Document:
 444        """
 445        >>> DiscardDiscriminationContentJa().\
 446            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
 447        True
 448
 449        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
 450        False
 451        """
 452        return super().apply(doc)
 453
 454
 455class DiscardViolenceContentJa(NgWordsFilterJa):
 456    """
 457    Discards documents containing Japanese keywords suggesting violence or threats.
 458    The keyword list is read from the file specified by `dict_path`.
 459    The file is a plain-text file listing one word per line.
 460    The default `dict_path` is /hojichar/dict/violence_keywords_ja.txt.
 461    """
 462
 463    def __init__(
 464        self,
 465        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
 466        *args: Any,
 467        **kwargs: Any,
 468    ) -> None:
 469        super().__init__(dict_path, *args, **kwargs)
 470
 471    def apply(self, doc: Document) -> Document:
 472        """
 473        >>> DiscardViolenceContentJa()\
 474            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
 475        True
 476
 477        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
 478        False
 479        """
 480        return super().apply(doc)
 481
 482
 483class DiscardBBSComments(Filter):
 484    """
 485    Discards documents that match the regular expression "BBS Pattern" more than `max_allowed_num` times.
 486    The default value of `max_allowed_num` is 14.
 487    The "BBS Pattern" regular expression can be inspected at the link below.
 488    https://regex101.com/r/ybQvL2/1
 489    """
 490
 491    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
 492        super().__init__(*args, **kwargs)
 493
 494        self.max_allowed_num = max_allowed_num
 495        self.keyword_pat = re.compile(
 496            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
 497        )
 498
 499    def apply(self, doc: Document) -> Document:
 500        """
 501        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
 502        True
 503
 504        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
 505        False
 506        """
 507        bbs_factor = self.keyword_pat.findall(doc.text)
 508        if len(bbs_factor) > self.max_allowed_num:
 509            doc.is_rejected = True
 510        return doc
 511
 512
 513class DiscardAds(Filter):
 514    """
 515    Discards documents that contain more than `max_allowed_num` advertisement keywords.
 516    The default `max_allowed_num` is 14.
 517    The advertisement keyword list is read from the file specified by `dict_path`.
 518    The file is a plain-text file listing one word per line.
 519    The default `dict_path` is /hojichar/dict/advertisement_keywords_ja.txt.
 520    """
 521
 522    def __init__(
 523        self,
 524        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
 525        max_allowed_num: int = 14,
 526        *args: Any,
 527        **kwargs: Any,
 528    ):
 529        super().__init__(*args, **kwargs)
 530
 531        self.max_allow_num = max_allowed_num
 532        with open(dict_path, encoding="utf-8") as fp:
 533            ng_words = fp.read().split("\n")
 534        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
 535        pat = r"|".join(ng_words)
 536        self.keyword_pat = re.compile(pat)
 537
 538    def apply(self, doc: Document) -> Document:
 539        """
 540        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
 541        True
 542
 543        >>> DiscardAds().apply(Document("おはよう")).is_rejected
 544        False
 545        """
 546        ads_factor = self.keyword_pat.findall(doc.text)
 547        if len(ads_factor) > self.max_allow_num:
 548            doc.is_rejected = True
 549        return doc
 550
 551
 552class AcceptJapanese(Filter):
 553    """
 554    Discards documents that are not in Japanese. Japanese is detected as follows:
 555        1. Look at the first `lookup_size` characters of the text (50 by default);
 556        if any hiragana or katakana is present, the text is judged to be Japanese.
 557    """
 558
 559    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
 560        super().__init__(*args, **kwargs)
 561
 562        self.lookup_size = lookup_size
 563        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")
 564
 565    def apply(self, doc: Document) -> Document:
 566        """
 567        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
 568        True
 569
 570        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
 571        True
 572
 573        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
 574        False
 575        """
 576        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
 577            doc.is_rejected = True
 578        return doc
 579
 580
 581class DiscardRareKuten(Filter):
 582    """
 583    Discards documents that are not in Japanese, judged as follows:
 584    the document is split on the kuten "。", and the document is discarded if the
 585    average sentence length exceeds `max_average_sentence_length`.
 586    The default value of `max_average_sentence_length` is 100.
 587    In effect, this filter discards documents whose ratio of kuten is too low.
 588    """
 589
 590    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
 591        super().__init__(*args, **kwargs)
 592
 593        self.max_average_sentence_length = max_average_sentence_length
 594        self.kuten_pat = re.compile(r"。")
 595
 596    def apply(self, doc: Document) -> Document:
 597        """
 598        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
 599        False
 600        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
 601        True
 602        """
 603        kuten_lst = self.kuten_pat.findall(doc.text)
 604        min_kuten_num = len(doc.text) / self.max_average_sentence_length
 605        if len(kuten_lst) < min_kuten_num:
 606            doc.is_rejected = True
 607        return doc
 608
 609
 610class HeaderFooterTagsRemover(Filter):
 611    """
 612    Inspects the tokens at the beginning and end of a document and removes
 613    tokens that look like header or footer tags.
 614
 615    Tokenize the document at the sentence level before applying this filter.
 616    This filter only modifies Document.tokens, so merge the tokens back into
 617    Document.text before output, or before any downstream filter that edits Document.text.
 618    """
 619
 620    def __init__(
 621        self,
 622        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
 623        *args: Any,
 624        **kwargs: Any,
 625    ) -> None:
 626        super().__init__(*args, **kwargs)
 627
 628        with open(dict_path) as fp:
 629            keywords = fp.read().split("\n")
 630        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
 631        self.keyword_pat = re.compile(r"|".join(keywords))
 632
 633    def apply(self, doc: Document) -> Document:
 634        if len(doc.tokens) == 0:
 635            return doc
 636
 637        lookup_size = 0
 638        if 1 <= len(doc.tokens) < 4:
 639            lookup_size = 1
 640        elif 4 <= len(doc.tokens) < 6:
 641            lookup_size = 2
 642        elif 6 <= len(doc.tokens):
 643            lookup_size = 3
 644
 645        for i in range(lookup_size):
 646            if self.should_drop_token(doc.tokens[i]):
 647                doc.tokens[i].is_rejected = True
 648            if self.should_drop_token(doc.tokens[-(i + 1)]):
 649                doc.tokens[-(i + 1)].is_rejected = True
 650
 651        return doc
 652
 653    def should_drop_token(self, token: Token) -> bool:
 654        """
 655        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
 656        True
 657
 658        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
 659        False
 660
 661        Comment.
 662        Original legacy code removed a pattern r"« _ | Main | _ »" .
 663        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
 664        It seems unintended behavior, so I fix this.
 665        """
 666        if self.keyword_pat.match(token.text):
 667            return True
 668        else:
 669            return False
 670
 671
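A minimal usage sketch for HeaderFooterTagsRemover (not from the module source), assuming a naive split on "。" as a stand-in for a proper sentence-level tokenizer and that Document.tokens can be assigned a plain list of Token objects:

    from hojichar.core.models import Document, Token
    from hojichar.filters.document_filters import HeaderFooterTagsRemover

    doc = Document("ログイン。今日の天気は晴れです。商品をカートに入れる。")
    # Tokenize to sentence level before applying the filter.
    doc.tokens = [Token(s) for s in doc.text.split("。") if s]
    doc = HeaderFooterTagsRemover().apply(doc)
    # Merge the surviving tokens back into Document.text before downstream filters.
    doc.text = "。".join(t.text for t in doc.tokens if not t.is_rejected)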
 672class MaskPersonalInformation(Filter):
 673    """
 674    Partially masks telephone numbers and e-mail addresses contained in the document.
 675    """
 676
 677    def __init__(self, *args: Any, **kwargs: Any) -> None:
 678        super().__init__(*args, **kwargs)
 679
 680        self.phone_pat = re.compile(
 681            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
 682        )
 683        self.email_pat = re.compile(
 684            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
 685        )
 686
 687    def apply(self, doc: Document) -> Document:
 688        """
 689        >>> MaskPersonalInformation()('06-1234-5678')
 690        '06-1234-XXXX'
 691        >>> MaskPersonalInformation()('075-123-4567')
 692        '075-123-XXXX'
 693        >>> MaskPersonalInformation()('0166-12-3456')
 694        '0166-12-XXXX'
 695        >>> MaskPersonalInformation()('09808-1-2345')
 696        '09808-1-XXXX'
 697        >>> MaskPersonalInformation()('090-1234-5678')
 698        '090-1234-XXXX'
 699        >>> MaskPersonalInformation()('0751234567')
 700        '075123XXXX'
 701        >>> MaskPersonalInformation()('08012345678')
 702        '0801234XXXX'
 703        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
 704        '連絡は075-123-XXXX まで'
 705        >>> MaskPersonalInformation()('+81-80-1234-5678')
 706        '+81-80-1234-XXXX'
 707        >>> MaskPersonalInformation()('+818012345678')
 708        '+81801234XXXX'
 709        >>> MaskPersonalInformation()('hogehoge@example.com')
 710        'xxxx@yyy.com'
 711        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
 712        '何かあれば xxxx@yyy.jp まで連絡'
 713        """
 714        text = self.phone_pat.sub(r"\1XXXX", doc.text)
 715        text = self.email_pat.sub(r"xxxx@yyy\1", text)
 716        doc.text = text
 717        return doc
 718
 719
 720class DiscardTooManyNouns(Filter):
 721    """
 722    [!CAUTION] This filter requires `fugashi` package. Please install it
 723    by `pip install 'hojichar[all]'`.
 724
 725    A filter that removes document with too many nouns in Japanese i.e.,
 726    documents such as advertisement, word salad, etc ...
 727    """
 728
 729    def __init__(
 730        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
 731    ) -> None:
 732        """
 733        Args:
 734            threshold: document whose noun ratio is higher than this value will be discarded
 735            max_parse_chars: maximum number of characters to parse in the document. Too large a value may cause a segmentation fault while parsing.
 736            *args:
 737            **kwargs:
 738        """
 739        super().__init__(*args, **kwargs)
 740        assert is_loaded_extras, (
 741            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
 742        )
 743
 744        self.threshold = threshold
 745        self.max_parse_chars = max_parse_chars
 746        self.tagger = Tagger("-Owakati")
 747        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
 748            "MeCab dictionary must be unidic"
 749        )
 750
 751    def _chunk_text(self, text: str) -> Iterable[str]:
 752        """Slice text into chunks of `max_parse_chars` length."""
 753        step = self.max_parse_chars
 754        for i in range(0, len(text), step):
 755            yield text[i : i + step]
 756
 757    def apply(self, doc: Document) -> Document:
 758        """
 759        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
 760        False
 761        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
 762        True
 763        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
 764        False
 765        """
 766        # remove "補助記号" from part-of-speech statistics
 767        # because they often decrease the noun ratio,
 768        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
 769        # however, we do not want to keep such sentences
 770
 771        pos_count: Counter[str] = Counter()
 772        for chunk in self._chunk_text(doc.text):
 773            for word in self.tagger(chunk):
 774                if word.feature.pos1 != "補助記号":
 775                    pos_count[word.feature.pos1] += 1
 776
 777        try:
 778            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
 779        except ZeroDivisionError:
 780            noun_ratio = 0.0
 781        if noun_ratio >= self.threshold:
 782            doc.is_rejected = True
 783        return doc
 784
 785
 786class CharRepetitionRatioFilter(Filter):
 787    """
 788    Computes the character n-gram repetition ratio (the share of the document made up of
 789    high-frequency character n-grams) and removes documents with a large overlap.
 790    Effective for removing advertisement text consisting of runs of nouns.
 791
 792    The implementation follows the preprocessing adopted by BigScience.
 793    Original implementation: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453  # noqa: E501
 794
 795    The number of "high-frequency character n-grams" is taken as sqrt(number of unique n-grams),
 796    which is said to mitigate the effect of document length.
 797
 798    Bulletin-board text tends to get caught: lines like
 799    "13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0" appear at high frequency, which drives the character n-gram repetition ratio up.
 800    """
 801
 802    def __init__(
 803        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
 804    ) -> None:
 805        """
 806
 807        Args:
 808            threshold: documents with a character repetition ratio higher than this value will be discarded
 809            ngram_size: character ngram size. Larger values decrease false positives on long documents
 810            *args:
 811            **kwargs:
 812        """  # noqa: E501
 813
 814        super().__init__(*args, **kwargs)
 815        self.threshold = threshold
 816        self.ngram_size = ngram_size
 817
 818    def apply(self, doc: Document) -> Document:
 819        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
 820        if ratio >= self.threshold:
 821            doc.is_rejected = True
 822        return doc
 823
 824    @staticmethod
 825    def compute_character_repetition_ratio(
 826        document: str, character_repetition_length: int
 827    ) -> float:
 828        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
 829            character_ngrams: List[str] = [
 830                document[i : i + n] for i in range(len(document) - n + 1)
 831            ]
 832            freq_character_ngrams_dict: Dict[str, int] = {}
 833            for character_ngram in character_ngrams:
 834                freq_character_ngrams_dict[character_ngram] = (
 835                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
 836                )
 837            return freq_character_ngrams_dict
 838
 839        freq_character_ngrams_dict = get_freq_character_ngrams(
 840            document, character_repetition_length
 841        )
 842        if len(freq_character_ngrams_dict) == 0:
 843            return 0.0
 844        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
 845        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
 846        val_one = len([el for el in freq_character_ngrams if el == 1])
 847        num_rep_character_ngrams = min(
 848            int(np.sqrt(len(freq_character_ngrams))),
 849            len(freq_character_ngrams) - val_one,
 850        )
 851        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
 852            freq_character_ngrams
 853        )
 854        return character_repetition_ratio
 855
 856
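A worked example of the ratio (a sketch using the static helper above): the 9-character text "あいうあいうあいう" yields five 5-grams, three of them unique; sqrt(3) rounds down to one "high-frequency" n-gram, which accounts for 2 of the 5 n-grams, so the ratio is 0.4 and the default threshold of 0.33 rejects the document.

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import CharRepetitionRatioFilter

    ratio = CharRepetitionRatioFilter.compute_character_repetition_ratio("あいうあいうあいう", 5)
    print(ratio)  # 0.4
    print(CharRepetitionRatioFilter().apply(Document("あいうあいうあいう")).is_rejected)  # True (0.4 >= 0.33)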
 857class WordRepetitionRatioFilter(Filter):
 858    """
 859    [!CAUTION] This filter requires `fugashi` package. Please install it
 860    by `pip install 'hojichar[all]'`.
 861
 862    A filter that computes the word n-gram repetition ratio (the share of the document made up of duplicated word n-grams) and rejects documents with a large overlap.
 863    Based on the preprocessing adopted by BigScience.
 864
 865    It appears effective for removing advertisement text in which nouns are repeated in a row.
 866    A legitimate document that happens to be repeated twice can also be caught; it is unclear whether such documents should be removed.
 867    Example:
 868    "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57)
 869    ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ
 870    られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる
 871    なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上
 872    高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら
 873    経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時
 874    56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄
 875    り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入
 876    るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増
 877    益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、
 878    電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ
 879    イの回復で収益が急回"
 880    """  # noqa: E501
 881
 882    def __init__(
 883        self,
 884        threshold: float = 0.40,
 885        ngram_size: int = 7,
 886        max_parse_chars: int = 100_000,
 887        *args: Any,
 888        **kwargs: Any,
 889    ) -> None:
 890        """
 891
 892        Args:
 893            threshold: documents whose word repetition ratio is higher than this value will be discarded
 894            ngram_size: word ngram size. Larger values decrease false positives on long documents
 895            max_parse_chars: maximum number of characters to parse in the document. Too large a value may cause a segmentation fault while parsing.
 896            *args:
 897            **kwargs:
 898        """  # noqa: E501
 899        super().__init__(*args, **kwargs)
 900        assert is_loaded_extras, (
 901            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
 902        )
 903
 904        self.threshold = threshold
 905        self.ngram_size = ngram_size
 906        self.max_parse_chars = max_parse_chars
 907        self.tagger = Tagger("-Owakati")
 908
 909    def _chunk_text(self, text: str) -> Iterable[str]:
 910        """Split text into chunks of `max_parse_chars` length."""
 911        step = self.max_parse_chars
 912        for i in range(0, len(text), step):
 913            yield text[i : i + step]
 914
 915    def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]:
 916        freq: Dict[str, int] = {}
 917        if n <= 0 or len(words) < n:
 918            return freq
 919        for i in range(len(words) - n + 1):
 920            key = " ".join(words[i : i + n])
 921            freq[key] = freq.get(key, 0) + 1
 922        return freq
 923
 924    def apply(self, doc: Document) -> Document:
 925        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
 926        if ratio >= self.threshold:
 927            doc.is_rejected = True
 928        return doc
 929
 930    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
 931        total_counter: Counter[str] = Counter()
 932
 933        for chunk in self._chunk_text(document):
 934            words = [w.surface for w in self.tagger(chunk)]
 935            total_counter.update(self._get_freq_word_ngrams(words, n))
 936
 937        if not total_counter:
 938            return 0.0
 939
 940        total = sum(total_counter.values())
 941        repeated = sum(v for v in total_counter.values() if v > 1)
 942        return repeated / total
 943
 944
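A usage sketch (not from the module source; requires the optional fugashi dependency installed via hojichar[all]): repeating the same long sentence twice makes most word 7-grams occur twice, so the ratio lands well above the default threshold of 0.40.

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import WordRepetitionRatioFilter

    sentence = "ニコンが急騰し、寄り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。"
    f = WordRepetitionRatioFilter()
    print(f.compute_word_repetition_ratio(sentence * 2, 7))  # well above the default threshold of 0.40
    print(f.apply(Document(sentence * 2)).is_rejected)       # True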
 945class DiscardTooManySpecialToken(Filter):
 946    """
 947    [!CAUTION] This filter requires `emoji` package. Please install it
 948    by `pip install 'hojichar[all]'`.
 949
 950    A filter that removes documents in which symbols (including punctuation), whitespace, emoji, and other special characters make up more than a given ratio of the text.
 951    Original implementation: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16  # noqa: E501
 952    """
 953
 954    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
 955        """
 956
 957        Args:
 958            threshold: document whose special token ratio is higher than this value will be discarded
 959            *args:
 960            **kwargs:
 961        """  # noqa: E501
 962        super().__init__(*args, **kwargs)
 963
 964        # digits are not regarded as special tokens
 965        # otherwise many false positives are made, i.e., good documents discarded
 966        main_special_characters = string.punctuation + string.whitespace  # + string.digits
 967        other_special_characters = (
 968            "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–▬…✦�­£​•€«»°·═"
 969            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
 970            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
 971            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
 972            "」﴾》�"
 973        )
 974
 975        en_emoji = emoji.EMOJI_DATA.keys()
 976
 977        special_characters_default = set(main_special_characters + other_special_characters)
 978        special_characters_default.update(en_emoji)
 979        self.special_characters = special_characters_default
 980
 981        self.threshold = threshold
 982
 983    def _compute_special_characters_ratio(self, text: str) -> float:
 984        if len(text) == 0:
 985            return 0
 986
 987        special_characters_ratio = len(
 988            [char for char in text if char in self.special_characters]
 989        ) / len(text)
 990        return special_characters_ratio
 991
 992    def apply(self, doc: Document) -> Document:
 993        special_characters_ratio = self._compute_special_characters_ratio(doc.text)
 994
 995        if special_characters_ratio > self.threshold:
 996            doc.is_rejected = True
 997        return doc
 998
 999
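A small sketch (not from the module source; requires the optional emoji dependency): a string made up entirely of punctuation has a special-character ratio of 1.0, well over the default threshold of 0.4.

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import DiscardTooManySpecialToken

    f = DiscardTooManySpecialToken()
    print(f.apply(Document("!!!???###...")).is_rejected)     # True: every character is punctuation
    print(f.apply(Document("ほうじ茶は美味しい")).is_rejected)  # False: no special characters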
1000class SingleCharacterRepetitionFilter(Filter):
1001    """
1002    A filter for removing documents in which a single character is repeated a large number of times,
1003    since such documents are very likely to be noise.
1004    Reference: according to the BigScience project, the OSCAR dataset contained documents consisting of nothing but 2M backslashes.
1005    https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset  # noqa: E501
1006    """
1007
1008    def __init__(
1009        self,
1010        threshold: int = 200,
1011        *args: Any,
1012        **kwargs: Any,
1013    ) -> None:
1014        """
1015        Args:
1016            threshold: The document is removed if any single character is repeated this many times or more
1017            *args:
1018            **kwargs:
1019        """
1020        super().__init__(*args, **kwargs)
1021        self.threshold = threshold
1022
1023    def _is_repeat_contained(self, text: str) -> bool:
1024        groups = groupby(text)
1025        is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups)
1026        return is_repeat_contained
1027
1028    def apply(self, doc: Document) -> Document:
1029        if self._is_repeat_contained(doc.text):
1030            doc.is_rejected = True
1031        return doc
1032
1033
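A small sketch (not from the module source): with the default threshold of 200, a run of 200 identical characters, like the OSCAR backslash documents mentioned above, is enough to reject.

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import SingleCharacterRepetitionFilter

    f = SingleCharacterRepetitionFilter()
    print(f.apply(Document("\\" * 200)).is_rejected)        # True: a run of 200 identical characters
    print(f.apply(Document("普通の文章です。")).is_rejected)  # False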
1034class DiscardTooManyEndingEllipsis(Filter):
1035    """
1036    A filter for removing documents that contain many lines ending with an ellipsis.
1037    Both ... and … are treated as ellipses.
1038    A similar filter is used in RedPajama v2.
1039
1040    As an example, it detects documents such as:
1041    ```
1042    ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
1043    バツイチアラフォー 婚活ち女性の特徴と子持な付...
1044    ```
1045
1046    The default threshold is 0.7; this is a precision-oriented setting that
1047    trims about 0.1% of C4.
1048    """
1049
1050    def __init__(
1051        self,
1052        threshold: float = 0.7,
1053        *args: Any,
1054        **kwargs: Any,
1055    ) -> None:
1056        """
1057        Args:
1058            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
1059            *args:
1060            **kwargs:
1061        """  # noqa: E501
1062        super().__init__(*args, **kwargs)
1063        self.threshold = threshold
1064        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n
1065
1066    def apply(self, doc: Document) -> Document:
1067        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
1068        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
1069        ellipsis_ratio = ellipsis_count / newline_count
1070
1071        if ellipsis_ratio > self.threshold:
1072            doc.is_rejected = True
1073        return doc
1074
1075
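A worked example (not from the module source): in the text below, three of the four newline-terminated lines end with an ellipsis, so the ratio is 3/4 = 0.75 and the document is rejected under the default threshold of 0.7.

    from hojichar.core.models import Document
    from hojichar.filters.document_filters import DiscardTooManyEndingEllipsis

    text = "商品一覧はこちら...\n人気ランキング...\n新着情報...\nお問い合わせ\n"
    print(DiscardTooManyEndingEllipsis().apply(Document(text)).is_rejected)  # True (0.75 > 0.7)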
1076class DiscardTooShortLines(Filter):
1077    """
1078    A filter for discarding documents that contain many short lines.
1079
1080    Effective for removing documents that consist largely of elements such as menu bars and breadcrumb lists.
1081    """
1082
1083    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
1084        """
1085        Args:
1086            threshold: The document is removed if the ratio of short (<10 chars) lines is higher than this value.
1087            *args:
1088            **kwargs:
1089        """  # noqa: E501
1090        super().__init__(*args, **kwargs)
1091        self.threshold = threshold
1092        # This value is a rough, hard-coded choice
1093        self.minimum_line_length = 10
1094
1095    def apply(self, doc: Document) -> Document:
1096        lines = [len(x) for x in doc.text.split("\n")]
1097        short_lines = [x for x in lines if x <= self.minimum_line_length]
1098        if (len(short_lines) / len(lines)) > self.threshold:
1099            doc.is_rejected = True
1100        return doc
class ExampleHojiChar(hojichar.core.filter_interface.Filter):

An example of a basic filter implementation. Appends '<hojichar>' to the end of the text.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
>>> ExampleHojiChar()("hello, world")
'hello, world<hojichar>'
class ExampleDiscardDocumentContainKeyword(hojichar.core.filter_interface.Filter):

An example filter that discards documents containing a specific keyword.

ExampleDiscardDocumentContainKeyword(keyword: str, *args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
>>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
True
class Identity(hojichar.core.filter_interface.Filter):

A filter that makes no changes to the document. Used for testing and debugging.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

class DiscardAll(hojichar.core.filter_interface.Filter):

A filter that discards all documents. Used for testing and debugging.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

class ApplyDiscard(hojichar.core.filter_interface.Filter):

Sets the text of a Document rejected by an upstream filter to an empty string.

Because documents with Document.is_rejected=True are ignored downstream, this filter has no effect when passed to the Compose constructor. It is mainly used inside Compose, or when debugging with discard_filtered=False.

ApplyDiscard(*args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
>>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
''
class Sleep(hojichar.core.filter_interface.Filter):

A filter for debugging. Sleeps for the specified number of seconds.

Sleep(time: float = 1.0, *args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
>>> Sleep(0.1)('hello')  # After 0.1 seconds,
'hello'
class DocumentNormalizer(hojichar.core.filter_interface.Filter):

Applies Unicode NFKC normalization to the document text.

DocumentNormalizer(*args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document

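A small usage sketch (not from the library's docstring): NFKC normalization folds full-width ASCII letters, digits, and ideographic spaces into their canonical half-width forms.

>>> DocumentNormalizer()("ＨｏｊｉＣｈａｒ　ｖ１．０")
'HojiChar v1.0'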
class JSONLoader(hojichar.core.filter_interface.Filter):

Parses the text as JSON and stores the element specified by key in the document as a string. The default key is 'text'.

If parsing the JSON or reading key fails, an exception is raised. To ignore these errors, set ignore=True; documents that fail to load are then rejected.

JSONLoader( key: str = 'text', ignore: bool = False, extra_keys: Optional[List[str]] = None, *args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used in the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
>>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
'hello, world'
>>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
Traceback (most recent call last):
    ...
json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
>>> JSONLoader()( '{"words": 2}' )
Traceback (most recent call last):
    ...
KeyError: 'text'
>>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
True
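JSONLoader is typically the first stage of a pipeline. A minimal sketch, assuming the top-level hojichar.Compose referred to in the docstrings above, that parses a JSONL line, normalizes it, filters it, and re-serializes it:

    from hojichar import Compose
    from hojichar.filters.document_filters import (
        JSONLoader, DocumentNormalizer, DocumentLengthFilter, JSONDumper,
    )

    cleaner = Compose([
        JSONLoader(key="text"),
        DocumentNormalizer(),
        DocumentLengthFilter(min_doc_len=2),
        JSONDumper(),
    ])
    print(cleaner('{"text": "ほうじ茶", "meta": "ignored"}'))  # -> '{"text": "ほうじ茶"}'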
class JSONDumper(hojichar.core.filter_interface.Filter):

Converts the Document.text string to JSON. Optionally attaches the Document's metadata, which includes the reason the document was rejected and is useful for analyzing false positives. skip_rejected defaults to False, so the filter runs regardless of the Document's rejection flag.

JSONDumper( dump_reason: bool = False, p: float = 1, skip_rejected: bool = False, export_extras: bool = False, *args: Any, **kwargs: Any)
199    def __init__(
200        self,
201        dump_reason: bool = False,
202        p: float = 1,
203        skip_rejected: bool = False,
204        export_extras: bool = False,
205        *args: Any,
206        **kwargs: Any,
207    ) -> None:
208        """
209        Args:
210            dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False.
211            p (float, optional): Apply probability. Defaults to 1.
212            skip_rejected (bool, optional): 破棄済みサンプルを排除しません.
213        """
214        super().__init__(p, skip_rejected, *args, **kwargs)
215        self.dump_reason = dump_reason
216        self.export_extras = export_extras

Args:
    dump_reason (bool, optional): Dumps the is_rejected and reason entries. Defaults to False.
    p (float, optional): Apply probability. Defaults to 1.
    skip_rejected (bool, optional): Does not exclude already-rejected samples. Defaults to False.

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
218    def apply(self, document: Document) -> Document:
219        """
220        >>> JSONDumper()("hojichar")
221        '{"text": "hojichar"}'
222        """
223        text = document.text
224        if self.dump_reason:
225            if self.export_extras:
226                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
227                document.text = json.dumps(
228                    {
229                        "text": text,
230                        "is_rejected": document.is_rejected,
231                        "reason": document.reject_reason,
232                        "extras": output_extras,
233                    },
234                    ensure_ascii=False,
235                )
236            else:
237                document.text = json.dumps(
238                    {
239                        "text": text,
240                        "is_rejected": document.is_rejected,
241                        "reason": document.reject_reason,
242                    },
243                    ensure_ascii=False,
244                )
245        else:
246            if self.export_extras:
247                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
248                document.text = json.dumps(
249                    {
250                        "text": text,
251                        "extras": output_extras,
252                    },
253                    ensure_ascii=False,
254                )
255            else:
256                document.text = json.dumps({"text": text}, ensure_ascii=False)
257        return document
>>> JSONDumper()("hojichar")
'{"text": "hojichar"}'
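
As a sketch of typical usage (assumed, not taken verbatim from the library documentation), JSONLoader and JSONDumper usually form the first and last steps of a Compose pipeline; with dump_reason=True the output JSON also records whether and why a document was rejected.

# Sketch: a small end-to-end pipeline. The length bounds are illustrative values.
from hojichar import Compose
from hojichar.filters.document_filters import DocumentLengthFilter, JSONDumper, JSONLoader

cleaner = Compose([
    JSONLoader(key="text", ignore=True),
    DocumentLengthFilter(min_doc_len=10, max_doc_len=50000),
    JSONDumper(dump_reason=True),  # skip_rejected=False by default, so rejected docs are still dumped
])
print(cleaner('{"text": "short"}'))  # the dumped JSON includes "is_rejected" and "reason" entries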
class DocumentLengthFilter(hojichar.core.filter_interface.Filter):
260class DocumentLengthFilter(Filter):
261    """
262    `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します.
263    デフォルトでは 200字 以上 50000字以内のテキストが受理されます.
264    """
265
266    def __init__(
267        self,
268        min_doc_len: Optional[int] = None,
269        max_doc_len: Optional[int] = None,
270        *args: Any,
271        **kwargs: Any,
272    ) -> None:
273        super().__init__(*args, **kwargs)
274
275        self.min_doc_len = min_doc_len
276        self.max_doc_len = max_doc_len
277
278    def apply(self, doc: Document) -> Document:
279        """
280        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
281        True
282        """
283        doc_len = len(doc.text)
284        if self.min_doc_len is not None:
285            if doc_len < self.min_doc_len:
286                doc.is_rejected = True
287        if self.max_doc_len is not None:
288            if self.max_doc_len < doc_len:
289                doc.is_rejected = True
290        return doc

Discards documents whose length is outside the range specified by min_doc_len and max_doc_len. By default, texts of 200 to 50,000 characters are accepted.

DocumentLengthFilter( min_doc_len: Optional[int] = None, max_doc_len: Optional[int] = None, *args: Any, **kwargs: Any)
266    def __init__(
267        self,
268        min_doc_len: Optional[int] = None,
269        max_doc_len: Optional[int] = None,
270        *args: Any,
271        **kwargs: Any,
272    ) -> None:
273        super().__init__(*args, **kwargs)
274
275        self.min_doc_len = min_doc_len
276        self.max_doc_len = max_doc_len

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
278    def apply(self, doc: Document) -> Document:
279        """
280        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
281        True
282        """
283        doc_len = len(doc.text)
284        if self.min_doc_len is not None:
285            if doc_len < self.min_doc_len:
286                doc.is_rejected = True
287        if self.max_doc_len is not None:
288            if self.max_doc_len < doc_len:
289                doc.is_rejected = True
290        return doc
>>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
True
class NgWordsFilterJa(hojichar.core.filter_interface.Filter):
293class NgWordsFilterJa(Filter):
294    """
295    日本語のNGワード(および不適切語)を含む文書を破棄します.
296    `dict_path` で指定したファイルから, キーワードのリストを得ます.
297    ファイルは単語が改行で羅列されたテキストファイルです.
298
299    `ignore_confused` を `True` にすると,
300    偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます.
301    デフォルト値は `False` です.
302    """
303
304    def __init__(
305        self,
306        dict_path: Union[str, PathLike],
307        ignore_confused: bool = False,
308        *args: Any,
309        **kwargs: Any,
310    ) -> None:
311        super().__init__(*args, **kwargs)
312
313        with open(dict_path, encoding="utf-8") as fp:
314            ng_words = fp.read().split("\n")
315        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
316
317        if ignore_confused:
318            words_katakana = []
319            words_not_katakana = []
320            for w in ng_words:
321                if re.fullmatch(r"[ァ-ヴー]+", w):
322                    words_katakana.append(re.escape(w))
323                else:
324                    words_not_katakana.append(re.escape(w))
325            katakana_pat = "|".join(words_katakana)
326            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
327            pat = "|".join(words_not_katakana) + "|" + katakana_pat
328            self.keyword_pat = re.compile(pat)
329        else:
330            ng_words = [re.escape(w) for w in ng_words]
331            pat = "|".join(ng_words)
332            self.keyword_pat = re.compile(pat)
333
334    def apply(self, doc: Document) -> Document:
335        regex_match = self.keyword_pat.search(doc.text)
336        if regex_match:
337            doc.is_rejected = True
338            self.matched_text = regex_match.group()
339            self.matched_text_neighbor = doc.text[
340                regex_match.start() - 20 : regex_match.end() + 20
341            ]
342
343        return doc

Discards documents that contain Japanese NG words (and other inappropriate terms). The keyword list is read from the file specified by dict_path, a plain-text file with one word per line.

When ignore_confused is True, katakana NG words are flagged only when they are not adjacent to other katakana characters, which reduces false positives. The default is False.

NgWordsFilterJa( dict_path: Union[str, os.PathLike], ignore_confused: bool = False, *args: Any, **kwargs: Any)
304    def __init__(
305        self,
306        dict_path: Union[str, PathLike],
307        ignore_confused: bool = False,
308        *args: Any,
309        **kwargs: Any,
310    ) -> None:
311        super().__init__(*args, **kwargs)
312
313        with open(dict_path, encoding="utf-8") as fp:
314            ng_words = fp.read().split("\n")
315        ng_words = [w.strip() for w in ng_words if not len(w) == 0]
316
317        if ignore_confused:
318            words_katakana = []
319            words_not_katakana = []
320            for w in ng_words:
321                if re.fullmatch(r"[ァ-ヴー]+", w):
322                    words_katakana.append(re.escape(w))
323                else:
324                    words_not_katakana.append(re.escape(w))
325            katakana_pat = "|".join(words_katakana)
326            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
327            pat = "|".join(words_not_katakana) + "|" + katakana_pat
328            self.keyword_pat = re.compile(pat)
329        else:
330            ng_words = [re.escape(w) for w in ng_words]
331            pat = "|".join(ng_words)
332            self.keyword_pat = re.compile(pat)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
334    def apply(self, doc: Document) -> Document:
335        regex_match = self.keyword_pat.search(doc.text)
336        if regex_match:
337            doc.is_rejected = True
338            self.matched_text = regex_match.group()
339            self.matched_text_neighbor = doc.text[
340                regex_match.start() - 20 : regex_match.end() + 20
341            ]
342
343        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; in most cases the hojichar.Document class is used.

In this method, the filter modifies document.text or document.extras, and sets document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed document
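
As a minimal sketch (not part of the library documentation), the snippet below builds a temporary keyword file and shows the effect of ignore_confused on an embedded katakana keyword; the keywords 不適切語 and サック are placeholders.

# Sketch: one keyword per line; ignore_confused avoids matches inside longer katakana words.
import tempfile

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterJa

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
    fp.write("不適切語\nサック\n")
    dict_path = fp.name

ng_filter = NgWordsFilterJa(dict_path, ignore_confused=True)
print(ng_filter.apply(Document("新しいサックを買った")).is_rejected)    # True: standalone katakana keyword
print(ng_filter.apply(Document("リュックサックを背負う")).is_rejected)  # False: embedded in リュックサック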

class NgWordsFilterEn(hojichar.core.filter_interface.Filter):
346class NgWordsFilterEn(Filter):
347    """
348    英語のNGワード(および不適切語)を含む文書を破棄します.
349    `dict_path` で指定したファイルから, キーワードのリストを得ます.
350    ファイルは単語が改行で羅列されたテキストファイルです.
351    """
352
353    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
354        super().__init__(*args, **kwargs)
355
356        with open(dict_path, encoding="utf-8") as fp:
357            ng_words = fp.read().split("\n")
358        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
359        pat = "|".join(ng_words)
360        # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ.
361        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)
362
363    def apply(self, doc: Document) -> Document:
364        if self.keyword_pat.search(doc.text):
365            doc.is_rejected = True
366        return doc

Discards documents that contain English NG words (and other inappropriate terms). The keyword list is read from the file specified by dict_path, a plain-text file with one word per line.

NgWordsFilterEn(dict_path: Union[str, os.PathLike], *args: Any, **kwargs: Any)
353    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
354        super().__init__(*args, **kwargs)
355
356        with open(dict_path, encoding="utf-8") as fp:
357            ng_words = fp.read().split("\n")
358        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
359        pat = "|".join(ng_words)
360        # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ.
361        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
363    def apply(self, doc: Document) -> Document:
364        if self.keyword_pat.search(doc.text):
365            doc.is_rejected = True
366        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; in most cases the hojichar.Document class is used.

In this method, the filter modifies document.text or document.extras, and sets document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed document
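
A minimal sketch (not part of the library documentation): because of the word-boundary pattern above, keywords are only matched as whole words; the keyword spam and the temporary file are placeholders.

# Sketch: whole-word matching of English NG keywords.
import tempfile

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterEn

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
    fp.write("spam\n")
    dict_path = fp.name

ng_filter = NgWordsFilterEn(dict_path)
print(ng_filter.apply(Document("This is spam mail")).is_rejected)      # True
print(ng_filter.apply(Document("This is a spammy mail")).is_rejected)  # False: not a whole-word match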

class DiscardAdultContentJa(NgWordsFilterJa):
369class DiscardAdultContentJa(NgWordsFilterJa):
370    """
371    日本語のアダルトキーワード(および不適切語)を含む文書を破棄します.
372    `dict_path` で指定したファイルから, キーワードのリストを得ます.
373    ファイルは単語が改行で羅列されたテキストファイルです.
374    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です.
375    """
376
377    def __init__(
378        self,
379        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
380        *args: Any,
381        **kwargs: Any,
382    ) -> None:
383        super().__init__(dict_path, *args, **kwargs)
384
385    def apply(self, doc: Document) -> Document:
386        """
387        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
388        True
389
390        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
391        False
392
393        挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック,
394        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
395        # Matching with NG keyword "アス"
396        True
397        """
398        return super().apply(doc)

Discards documents that contain Japanese adult keywords (and other inappropriate terms). The keyword list is read from the file specified by dict_path, a plain-text file with one word per line. The default dict_path is /hojichar/dict/adult_keywords_ja.txt.

DiscardAdultContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/adult_keywords_ja.txt'), *args: Any, **kwargs: Any)
377    def __init__(
378        self,
379        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
380        *args: Any,
381        **kwargs: Any,
382    ) -> None:
383        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
385    def apply(self, doc: Document) -> Document:
386        """
387        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
388        True
389
390        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
391        False
392
393        挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック,
394        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
395        # Matching with NG keyword "アス"
396        True
397        """
398        return super().apply(doc)
>>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
False

A case where the behavior is correct but the match is a false positive. Another example is サック matching inside リュックサック.

>>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected  # Matching with NG keyword "アス"
True
class DiscardAdultContentEn(NgWordsFilterEn):
401class DiscardAdultContentEn(NgWordsFilterEn):
402    """
403    英語のアダルトキーワード(および不適切語)を含む文書を破棄します.
404    `dict_path` で指定したファイルから, キーワードのリストを得ます.
405    ファイルは単語が改行で羅列されたテキストファイルです.
406    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です.
407    """
408
409    def __init__(
410        self,
411        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
412        *args: Any,
413        **kwargs: Any,
414    ) -> None:
415        super().__init__(dict_path, *args, **kwargs)
416
417    def apply(self, doc: Document) -> Document:
418        """
419        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
420        True
421
422        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
423        False
424        """
425        return super().apply(doc)

Discards documents that contain English adult keywords (and other inappropriate terms). The keyword list is read from the file specified by dict_path, a plain-text file with one word per line. The default dict_path is /hojichar/dict/adult_keywords_en.txt.

DiscardAdultContentEn( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/adult_keywords_en.txt'), *args: Any, **kwargs: Any)
409    def __init__(
410        self,
411        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
412        *args: Any,
413        **kwargs: Any,
414    ) -> None:
415        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
417    def apply(self, doc: Document) -> Document:
418        """
419        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
420        True
421
422        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
423        False
424        """
425        return super().apply(doc)
>>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
False
class DiscardDiscriminationContentJa(NgWordsFilterJa):
428class DiscardDiscriminationContentJa(NgWordsFilterJa):
429    """
430    日本語の差別キーワード(および不適切語)を含む文書を破棄します.
431    `dict_path` で指定したファイルから, キーワードのリストを得ます.
432    ファイルは単語が改行で羅列されたテキストファイルです.
433    デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です.
434    """
435
436    def __init__(
437        self,
438        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
439        *args: Any,
440        **kwargs: Any,
441    ):
442        super().__init__(dict_path, *args, **kwargs)
443
444    def apply(self, doc: Document) -> Document:
445        """
446        >>> DiscardDiscriminationContentJa().\
447            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
448        True
449
450        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
451        False
452        """
453        return super().apply(doc)

Discards documents that contain Japanese discriminatory keywords (and other inappropriate terms). The keyword list is read from the file specified by dict_path, a plain-text file with one word per line. The default dict_path is /hojichar/dict/discrimination_keywords_ja.txt.

DiscardDiscriminationContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/discrimination_keywords_ja.txt'), *args: Any, **kwargs: Any)
436    def __init__(
437        self,
438        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
439        *args: Any,
440        **kwargs: Any,
441    ):
442        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
444    def apply(self, doc: Document) -> Document:
445        """
446        >>> DiscardDiscriminationContentJa().\
447            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
448        True
449
450        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
451        False
452        """
453        return super().apply(doc)
>>> DiscardDiscriminationContentJa().apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
True
>>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
False
class DiscardViolenceContentJa(NgWordsFilterJa):
456class DiscardViolenceContentJa(NgWordsFilterJa):
457    """
458    日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します.
459    `dict_path` で指定したファイルから, キーワードのリストを得ます.
460    ファイルは単語が改行で羅列されたテキストファイルです.
461    デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です.
462    """
463
464    def __init__(
465        self,
466        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
467        *args: Any,
468        **kwargs: Any,
469    ) -> None:
470        super().__init__(dict_path, *args, **kwargs)
471
472    def apply(self, doc: Document) -> Document:
473        """
474        >>> DiscardViolenceContentJa()\
475            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
476        True
477
478        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
479        False
480        """
481        return super().apply(doc)

Discards documents that contain Japanese keywords suggesting violence or threats. The keyword list is read from the file specified by dict_path, a plain-text file with one word per line. The default dict_path is /hojichar/dict/violence_keywords_ja.txt.

DiscardViolenceContentJa( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/violence_keywords_ja.txt'), *args: Any, **kwargs: Any)
464    def __init__(
465        self,
466        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
467        *args: Any,
468        **kwargs: Any,
469    ) -> None:
470        super().__init__(dict_path, *args, **kwargs)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
472    def apply(self, doc: Document) -> Document:
473        """
474        >>> DiscardViolenceContentJa()\
475            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
476        True
477
478        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
479        False
480        """
481        return super().apply(doc)
>>> DiscardViolenceContentJa().apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
True
>>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
False
class DiscardBBSComments(hojichar.core.filter_interface.Filter):
484class DiscardBBSComments(Filter):
485    """
486    正規表現 "BBS Pattern" に `max_allow_num` 回よりたくさんマッチする文書を破棄します.
487    `max_allow_num` のデフォルト値は14です.
488    正規表現 "BBS Pattern" は下記のリンクで検証可能です.
489    https://regex101.com/r/ybQvL2/1
490    """
491
492    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
493        super().__init__(*args, **kwargs)
494
495        self.max_allowed_num = max_allowed_num
496        self.keyword_pat = re.compile(
497            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
498        )
499
500    def apply(self, doc: Document) -> Document:
501        """
502        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
503        True
504
505        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
506        False
507        """
508        bbs_factor = self.keyword_pat.findall(doc.text)
509        if len(bbs_factor) > self.max_allowed_num:
510            doc.is_rejected = True
511        return doc

Discards documents that match the "BBS Pattern" regular expression more than max_allowed_num times. The default value of max_allowed_num is 14. The "BBS Pattern" regular expression can be inspected at the following link: https://regex101.com/r/ybQvL2/1

DiscardBBSComments(max_allowed_num: int = 14, *args: Any, **kwargs: Any)
492    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
493        super().__init__(*args, **kwargs)
494
495        self.max_allowed_num = max_allowed_num
496        self.keyword_pat = re.compile(
497            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
498        )

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
500    def apply(self, doc: Document) -> Document:
501        """
502        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
503        True
504
505        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
506        False
507        """
508        bbs_factor = self.keyword_pat.findall(doc.text)
509        if len(bbs_factor) > self.max_allowed_num:
510            doc.is_rejected = True
511        return doc
>>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
True
>>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
False
class DiscardAds(hojichar.core.filter_interface.Filter):
514class DiscardAds(Filter):
515    """
516    主に広告キーワードを`max_allow_num`より多く含む文書を破棄します.
517    デフォルトで`max_allow_num` は14です.
518    `dict_path` で指定したファイルから, 広告キーワードのリストを得ます.
519    ファイルは単語が改行で羅列されたテキストファイルです.
520    デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です.
521    """
522
523    def __init__(
524        self,
525        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
526        max_allowed_num: int = 14,
527        *args: Any,
528        **kwargs: Any,
529    ):
530        super().__init__(*args, **kwargs)
531
532        self.max_allow_num = max_allowed_num
533        with open(dict_path, encoding="utf-8") as fp:
534            ng_words = fp.read().split("\n")
535        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
536        pat = r"|".join(ng_words)
537        self.keyword_pat = re.compile(pat)
538
539    def apply(self, doc: Document) -> Document:
540        """
541        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
542        True
543
544        >>> DiscardAds().apply(Document("おはよう")).is_rejected
545        False
546        """
547        ads_factor = self.keyword_pat.findall(doc.text)
548        if len(ads_factor) > self.max_allow_num:
549            doc.is_rejected = True
550        return doc

Discards documents that contain more than max_allowed_num advertising keywords. The default max_allowed_num is 14. The keyword list is read from the file specified by dict_path, a plain-text file with one word per line. The default dict_path is /hojichar/dict/advertisement_keywords_ja.txt.

DiscardAds( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/advertisement_keywords_ja.txt'), max_allowed_num: int = 14, *args: Any, **kwargs: Any)
523    def __init__(
524        self,
525        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
526        max_allowed_num: int = 14,
527        *args: Any,
528        **kwargs: Any,
529    ):
530        super().__init__(*args, **kwargs)
531
532        self.max_allow_num = max_allowed_num
533        with open(dict_path, encoding="utf-8") as fp:
534            ng_words = fp.read().split("\n")
535        ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
536        pat = r"|".join(ng_words)
537        self.keyword_pat = re.compile(pat)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
539    def apply(self, doc: Document) -> Document:
540        """
541        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
542        True
543
544        >>> DiscardAds().apply(Document("おはよう")).is_rejected
545        False
546        """
547        ads_factor = self.keyword_pat.findall(doc.text)
548        if len(ads_factor) > self.max_allow_num:
549            doc.is_rejected = True
550        return doc
>>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
True
>>> DiscardAds().apply(Document("おはよう")).is_rejected
False
class AcceptJapanese(hojichar.core.filter_interface.Filter):
553class AcceptJapanese(Filter):
554    """
555    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます.
556        1. テキストを左から`lookup_size` (デフォルトで50字) 参照し,
557        ひらがな・カタカナが存在すれば日本語と判定する.
558    """
559
560    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
561        super().__init__(*args, **kwargs)
562
563        self.lookup_size = lookup_size
564        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")
565
566    def apply(self, doc: Document) -> Document:
567        """
568        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
569        True
570
571        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
572        True
573
574        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
575        False
576        """
577        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
578            doc.is_rejected = True
579        return doc

Discards documents that are not in Japanese. Japanese is detected as follows: 1. Look at the first lookup_size characters of the text (50 by default); if any hiragana or katakana characters are present, the document is judged to be Japanese.

AcceptJapanese(lookup_size: int = 50, *args: Any, **kwargs: Any)
560    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
561        super().__init__(*args, **kwargs)
562
563        self.lookup_size = lookup_size
564        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
566    def apply(self, doc: Document) -> Document:
567        """
568        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
569        True
570
571        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
572        True
573
574        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
575        False
576        """
577        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
578            doc.is_rejected = True
579        return doc
>>> AcceptJapanese().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
True
>>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
False
class DiscardRareKuten(hojichar.core.filter_interface.Filter):
582class DiscardRareKuten(Filter):
583    """
584    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます
585    ドキュメントを句点"。"で区切り, 平均文長が
586    `max_avarage_sentence_length` より長い場合は破棄します.
587    `max_avarage_sentence_length` のデフォルト値は100です.
588    このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します.
589    """
590
591    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
592        super().__init__(*args, **kwargs)
593
594        self.max_average_sentence_length = max_average_sentence_length
595        self.kuten_pat = re.compile(r"。")
596
597    def apply(self, doc: Document) -> Document:
598        """
599        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
600        False
601        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
602        True
603        """
604        kuten_lst = self.kuten_pat.findall(doc.text)
605        min_kuten_num = len(doc.text) / self.max_average_sentence_length
606        if len(kuten_lst) < min_kuten_num:
607            doc.is_rejected = True
608        return doc

Discards documents that are not in Japanese. The document is split on the Japanese full stop "。", and if the average sentence length exceeds max_average_sentence_length, the document is discarded. The default value of max_average_sentence_length is 100. In effect, this filter discards documents in which the full stop appears too rarely; for example, with the default of 100, a 500-character document is kept only if it contains at least five "。".

DiscardRareKuten(max_average_sentence_length: int = 100, *args: Any, **kwargs: Any)
591    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
592        super().__init__(*args, **kwargs)
593
594        self.max_average_sentence_length = max_average_sentence_length
595        self.kuten_pat = re.compile(r"。")

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
597    def apply(self, doc: Document) -> Document:
598        """
599        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
600        False
601        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
602        True
603        """
604        kuten_lst = self.kuten_pat.findall(doc.text)
605        min_kuten_num = len(doc.text) / self.max_average_sentence_length
606        if len(kuten_lst) < min_kuten_num:
607            doc.is_rejected = True
608        return doc
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
False
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
True
class HeaderFooterTagsRemover(hojichar.core.filter_interface.Filter):
611class HeaderFooterTagsRemover(Filter):
612    """
613    ドキュメントの冒頭・末尾のトークンを調査し, ヘッダー・フッダー的な
614    タグが存在していた場合, そのトークンを除去します.
615
616    このフィルタを通す前に, 事前にセンテンスレベルにトーカナイズしておいてください.
617    このフィルタでは Document.token にのみ変更が加えられるので, 出力前 あるいは 下流フィルタで
618    Document.text に変更を加える前にトークンをマージしておいてください.
619    """
620
621    def __init__(
622        self,
623        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
624        *args: Any,
625        **kwargs: Any,
626    ) -> None:
627        super().__init__(*args, **kwargs)
628
629        with open(dict_path) as fp:
630            keywords = fp.read().split("\n")
631        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
632        self.keyword_pat = re.compile(r"|".join(keywords))
633
634    def apply(self, doc: Document) -> Document:
635        if len(doc.tokens) == 0:
636            return doc
637
638        lookup_size = 0
639        if 1 <= len(doc.tokens) < 4:
640            lookup_size = 1
641        elif 4 <= len(doc.tokens) < 6:
642            lookup_size = 2
643        elif 6 <= len(doc.tokens):
644            lookup_size = 3
645
646        for i in range(lookup_size):
647            if self.should_drop_token(doc.tokens[i]):
648                doc.tokens[i].is_rejected = True
649            if self.should_drop_token(doc.tokens[-(i + 1)]):
650                doc.tokens[i].is_rejected = True
651
652        return doc
653
654    def should_drop_token(self, token: Token) -> bool:
655        """
656        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
657        True
658
659        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
660        False
661
662        Comment.
663        Original legacy code removed a pattern r"« _ | Main | _ »" .
664        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
665        It seems unintended behavior, so I fix this.
666        """
667        if self.keyword_pat.match(token.text):
668            return True
669        else:
670            return False

Inspects the tokens at the beginning and end of a document and removes tokens that look like header or footer tags.

Tokenize the document at the sentence level before applying this filter. This filter only modifies Document.tokens, so merge the tokens back before output, or before a downstream filter modifies Document.text.

HeaderFooterTagsRemover( dict_path: Union[str, os.PathLike] = PosixPath('/home/runner/work/HojiChar/HojiChar/hojichar/dict/header_footer_keywords_ja.txt'), *args: Any, **kwargs: Any)
621    def __init__(
622        self,
623        dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt",
624        *args: Any,
625        **kwargs: Any,
626    ) -> None:
627        super().__init__(*args, **kwargs)
628
629        with open(dict_path) as fp:
630            keywords = fp.read().split("\n")
631        keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0]
632        self.keyword_pat = re.compile(r"|".join(keywords))

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
634    def apply(self, doc: Document) -> Document:
635        if len(doc.tokens) == 0:
636            return doc
637
638        lookup_size = 0
639        if 1 <= len(doc.tokens) < 4:
640            lookup_size = 1
641        elif 4 <= len(doc.tokens) < 6:
642            lookup_size = 2
643        elif 6 <= len(doc.tokens):
644            lookup_size = 3
645
646        for i in range(lookup_size):
647            if self.should_drop_token(doc.tokens[i]):
648                doc.tokens[i].is_rejected = True
649            if self.should_drop_token(doc.tokens[-(i + 1)]):
650                doc.tokens[i].is_rejected = True
651
652        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; in most cases the hojichar.Document class is used.

In this method, the filter modifies document.text or document.extras, and sets document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed document

def should_drop_token(self, token: hojichar.core.models.Token) -> bool:
654    def should_drop_token(self, token: Token) -> bool:
655        """
656        >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
657        True
658
659        >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
660        False
661
662        Comment.
663        Original legacy code removed a pattern r"« _ | Main | _ »" .
664        In the pattern, "|" is not escaped, so **ANY** string was eliminated.
665        It seems unintended behavior, so I fix this.
666        """
667        if self.keyword_pat.match(token.text):
668            return True
669        else:
670            return False
>>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>"))
True
>>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶"))
False

Comment: the original legacy code removed the pattern r"« _ | Main | _ »". In that pattern, "|" is not escaped, so ANY string was matched and eliminated. This appears to be unintended behavior, so it has been fixed.
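
A minimal sketch (not part of the library documentation) of the token-level workflow described above. It assumes Document.tokens is a plain list that can be assigned directly, as the read access in the source suggests; the token strings are placeholders, and which tokens get flagged depends on the bundled keyword dictionary.

# Sketch: populate Document.tokens by hand, run the remover, and keep the surviving tokens.
from hojichar.core.models import Document, Token
from hojichar.filters.document_filters import HeaderFooterTagsRemover

doc = Document("")
doc.tokens = [Token("ログイン"), Token("ほうじ茶の淹れ方について"), Token("トップページ")]  # assumed assignable

doc = HeaderFooterTagsRemover().apply(doc)
surviving = [t.text for t in doc.tokens if not t.is_rejected]
print(surviving)  # tokens matching header/footer keywords are flagged as rejected
# In a real pipeline, a sentence tokenizer would fill doc.tokens and a merge step would
# write the surviving tokens back into doc.text before any downstream text filter runs.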

class MaskPersonalInformation(hojichar.core.filter_interface.Filter):
673class MaskPersonalInformation(Filter):
674    """
675    ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします.
676    """
677
678    def __init__(self, *args: Any, **kwargs: Any) -> None:
679        super().__init__(*args, **kwargs)
680
681        self.phone_pat = re.compile(
682            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
683        )
684        self.email_pat = re.compile(
685            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
686        )
687
688    def apply(self, doc: Document) -> Document:
689        """
690        >>> MaskPersonalInformation()('06-1234-5678')
691        '06-1234-XXXX'
692        >>> MaskPersonalInformation()('075-123-4567')
693        '075-123-XXXX'
694        >>> MaskPersonalInformation()('0166-12-3456')
695        '0166-12-XXXX'
696        >>> MaskPersonalInformation()('09808-1-2345')
697        '09808-1-XXXX'
698        >>> MaskPersonalInformation()('090-1234-5678')
699        '090-1234-XXXX'
700        >>> MaskPersonalInformation()('0751234567')
701        '075123XXXX'
702        >>> MaskPersonalInformation()('08012345678')
703        '0801234XXXX'
704        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
705        '連絡は075-123-XXXX まで'
706        >>> MaskPersonalInformation()('+81-80-1234-5678')
707        '+81-80-1234-XXXX'
708        >>> MaskPersonalInformation()('+818012345678')
709        '+81801234XXXX'
710        >>> MaskPersonalInformation()('hogehoge@example.com')
711        'xxxx@yyy.com'
712        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
713        '何かあれば xxxx@yyy.jp まで連絡'
714        """
715        text = self.phone_pat.sub(r"\1XXXX", doc.text)
716        text = self.email_pat.sub(r"xxxx@yyy\1", text)
717        doc.text = text
718        return doc

Partially masks telephone numbers and e-mail addresses contained in a document.

MaskPersonalInformation(*args: Any, **kwargs: Any)
678    def __init__(self, *args: Any, **kwargs: Any) -> None:
679        super().__init__(*args, **kwargs)
680
681        self.phone_pat = re.compile(
682            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
683        )
684        self.email_pat = re.compile(
685            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
686        )

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within a Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
688    def apply(self, doc: Document) -> Document:
689        """
690        >>> MaskPersonalInformation()('06-1234-5678')
691        '06-1234-XXXX'
692        >>> MaskPersonalInformation()('075-123-4567')
693        '075-123-XXXX'
694        >>> MaskPersonalInformation()('0166-12-3456')
695        '0166-12-XXXX'
696        >>> MaskPersonalInformation()('09808-1-2345')
697        '09808-1-XXXX'
698        >>> MaskPersonalInformation()('090-1234-5678')
699        '090-1234-XXXX'
700        >>> MaskPersonalInformation()('0751234567')
701        '075123XXXX'
702        >>> MaskPersonalInformation()('08012345678')
703        '0801234XXXX'
704        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
705        '連絡は075-123-XXXX まで'
706        >>> MaskPersonalInformation()('+81-80-1234-5678')
707        '+81-80-1234-XXXX'
708        >>> MaskPersonalInformation()('+818012345678')
709        '+81801234XXXX'
710        >>> MaskPersonalInformation()('hogehoge@example.com')
711        'xxxx@yyy.com'
712        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
713        '何かあれば xxxx@yyy.jp まで連絡'
714        """
715        text = self.phone_pat.sub(r"\1XXXX", doc.text)
716        text = self.email_pat.sub(r"xxxx@yyy\1", text)
717        doc.text = text
718        return doc
>>> MaskPersonalInformation()('06-1234-5678')
'06-1234-XXXX'
>>> MaskPersonalInformation()('075-123-4567')
'075-123-XXXX'
>>> MaskPersonalInformation()('0166-12-3456')
'0166-12-XXXX'
>>> MaskPersonalInformation()('09808-1-2345')
'09808-1-XXXX'
>>> MaskPersonalInformation()('090-1234-5678')
'090-1234-XXXX'
>>> MaskPersonalInformation()('0751234567')
'075123XXXX'
>>> MaskPersonalInformation()('08012345678')
'0801234XXXX'
>>> MaskPersonalInformation()('連絡は075-123-4567 まで')
'連絡は075-123-XXXX まで'
>>> MaskPersonalInformation()('+81-80-1234-5678')
'+81-80-1234-XXXX'
>>> MaskPersonalInformation()('+818012345678')
'+81801234XXXX'
>>> MaskPersonalInformation()('hogehoge@example.com')
'xxxx@yyy.com'
>>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
'何かあれば xxxx@yyy.jp まで連絡'
class DiscardTooManyNouns(hojichar.core.filter_interface.Filter):
721class DiscardTooManyNouns(Filter):
722    """
723    [!CAUTION] This filter requires `fugashi` package. Please install it
724    by `pip install 'hojichar[all]'`.
725
726    A filter that removes document with too many nouns in Japanese i.e.,
727    documents such as advertisement, word salad, etc ...
728    """
729
730    def __init__(
731        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
732    ) -> None:
733        """
734        Args:
735            threshold: document whose noun ratio is higher than this value will be discarded
736            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
737            *args:
738            **kwargs:
739        """
740        super().__init__(*args, **kwargs)
741        assert is_loaded_extras, (
742            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
743        )
744
745        self.threshold = threshold
746        self.max_parse_chars = max_parse_chars
747        self.tagger = Tagger("-Owakati")
748        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
749            "MeCab dictionary must be unidic"
750        )
751
752    def _chunk_text(self, text: str) -> Iterable[str]:
753        """Slice text into chunks of `max_parse_chars` length."""
754        step = self.max_parse_chars
755        for i in range(0, len(text), step):
756            yield text[i : i + step]
757
758    def apply(self, doc: Document) -> Document:
759        """
760        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
761        False
762        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
763        True
764        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
765        False
766        """
767        # remove "補助記号" from part-of-speech statistics
768        # because they often decrease the noun ratio,
769        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
770        # however, we don't want such sentence
771
772        pos_count: Counter[str] = Counter()
773        for chunk in self._chunk_text(doc.text):
774            for word in self.tagger(chunk):
775                if word.feature.pos1 != "補助記号":
776                    pos_count[word.feature.pos1] += 1
777
778        try:
779            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
780        except ZeroDivisionError:
781            noun_ratio = 0.0
782        if noun_ratio >= self.threshold:
783            doc.is_rejected = True
784        return doc

[!CAUTION] This filter requires the fugashi package. Please install it with pip install 'hojichar[all]'.

A filter that removes documents with too many nouns in Japanese, i.e., documents such as advertisements, word salad, etc.

DiscardTooManyNouns( threshold: float = 0.8, max_parse_chars: int = 100000, *args: Any, **kwargs: Any)
730    def __init__(
731        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
732    ) -> None:
733        """
734        Args:
735            threshold: document whose noun ratio is higher than this value will be discarded
736            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
737            *args:
738            **kwargs:
739        """
740        super().__init__(*args, **kwargs)
741        assert is_loaded_extras, (
742            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
743        )
744
745        self.threshold = threshold
746        self.max_parse_chars = max_parse_chars
747        self.tagger = Tagger("-Owakati")
748        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
749            "MeCab dictionary must be unidic"
750        )

Args:
    threshold: document whose noun ratio is higher than this value will be discarded
    max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
758    def apply(self, doc: Document) -> Document:
759        """
760        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
761        False
762        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
763        True
764        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
765        False
766        """
767        # remove "補助記号" from part-of-speech statistics
768        # because they often decrease the noun ratio,
769        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
770        # however, we don't want such sentence
771
772        pos_count: Counter[str] = Counter()
773        for chunk in self._chunk_text(doc.text):
774            for word in self.tagger(chunk):
775                if word.feature.pos1 != "補助記号":
776                    pos_count[word.feature.pos1] += 1
777
778        try:
779            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
780        except ZeroDivisionError:
781            noun_ratio = 0.0
782        if noun_ratio >= self.threshold:
783            doc.is_rejected = True
784        return doc
>>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
False
>>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
True
>>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
False
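A minimal usage sketch of this filter inside a pipeline, assuming hojichar[all] is installed (fugashi with a unidic dictionary) and that Compose is importable from the top-level hojichar package; the sample text reuses the doctest above:

```
from hojichar import Compose
from hojichar.core.models import Document
from hojichar.filters import document_filters

cleaner = Compose([
    document_filters.DiscardTooManyNouns(threshold=0.80),
])

# Noun-heavy, word-salad-like text is rejected; ordinary prose passes.
doc = cleaner.apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中"))
print(doc.is_rejected)  # True
```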
class CharRepetitionRatioFilter(hojichar.core.filter_interface.Filter):
787class CharRepetitionRatioFilter(Filter):
788    """
789    文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します.
790    名詞の連続からなるような広告テキストを取り除くのに有効です.
791
792    実装は, BigScience で採用されていた前処理を参考にしています.
793    元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453  # noqa: E501
794
795    「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが,
796    これは文書長の影響を軽減するためだとされています.
797
798    掲示板のテキストが引っかかりやすい傾向があります.
799    13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0
800    的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう
801    """
802
803    def __init__(
804        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
805    ) -> None:
806        """
807
808        Args:
809            threshold: document with character repetition ratio higher than this value will be discarded
810            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
811            *args:
812            **kwargs:
813        """  # noqa: E501
814
815        super().__init__(*args, **kwargs)
816        self.threshold = threshold
817        self.ngram_size = ngram_size
818
819    def apply(self, doc: Document) -> Document:
820        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
821        if ratio >= self.threshold:
822            doc.is_rejected = True
823        return doc
824
825    @staticmethod
826    def compute_character_repetition_ratio(
827        document: str, character_repetition_length: int
828    ) -> float:
829        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
830            character_ngrams: List[str] = [
831                document[i : i + n] for i in range(len(document) - n + 1)
832            ]
833            freq_character_ngrams_dict: Dict[str, int] = {}
834            for character_ngram in character_ngrams:
835                freq_character_ngrams_dict[character_ngram] = (
836                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
837                )
838            return freq_character_ngrams_dict
839
840        freq_character_ngrams_dict = get_freq_character_ngrams(
841            document, character_repetition_length
842        )
843        if len(freq_character_ngrams_dict) == 0:
844            return 0.0
845        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
846        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
847        val_one = len([el for el in freq_character_ngrams if el == 1])
848        num_rep_character_ngrams = min(
849            int(np.sqrt(len(freq_character_ngrams))),
850            len(freq_character_ngrams) - val_one,
851        )
852        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
853            freq_character_ngrams
854        )
855        return character_repetition_ratio

Computes the character n-gram overlap ratio (the share of a document covered by high-frequency character n-grams) and removes documents with a large overlap. This is effective for removing advertisement text made up of runs of nouns.

The implementation follows the preprocessing adopted by BigScience. Original implementation: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453

The number of "high-frequency character n-grams" is taken as sqrt(total number of unique n-grams), which is said to reduce the influence of document length.

Message-board text tends to be caught by this filter: lines like "13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0" appear at high frequency, so the character n-gram overlap ratio becomes high.
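As a small worked example of the heuristic above, the static helper can be called directly; the repetitive sample string is an assumption for illustration:

```
from hojichar.filters.document_filters import CharRepetitionRatioFilter

text = "広告テキスト 広告テキスト 広告テキスト 広告テキスト"
ratio = CharRepetitionRatioFilter.compute_character_repetition_ratio(text, 5)
# The text has 23 character 5-grams but only 7 unique ones; the top
# int(sqrt(7)) = 2 most frequent 5-grams cover 8 of the 23 occurrences,
# so the ratio is roughly 0.35 and exceeds the default threshold of 0.33.
print(ratio)
```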

CharRepetitionRatioFilter( threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any)
803    def __init__(
804        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
805    ) -> None:
806        """
807
808        Args:
809            threshold: document with character repetition ratio higher than this value will be discarded
810            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
811            *args:
812            **kwargs:
813        """  # noqa: E501
814
815        super().__init__(*args, **kwargs)
816        self.threshold = threshold
817        self.ngram_size = ngram_size

Args:
    threshold: document with character repetition ratio higher than this value will be discarded
    ngram_size: character ngram size. Larger value will decrease the false positive of long documents
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
819    def apply(self, doc: Document) -> Document:
820        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
821        if ratio >= self.threshold:
822            doc.is_rejected = True
823        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document

@staticmethod
def compute_character_repetition_ratio(document: str, character_repetition_length: int) -> float:
825    @staticmethod
826    def compute_character_repetition_ratio(
827        document: str, character_repetition_length: int
828    ) -> float:
829        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
830            character_ngrams: List[str] = [
831                document[i : i + n] for i in range(len(document) - n + 1)
832            ]
833            freq_character_ngrams_dict: Dict[str, int] = {}
834            for character_ngram in character_ngrams:
835                freq_character_ngrams_dict[character_ngram] = (
836                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
837                )
838            return freq_character_ngrams_dict
839
840        freq_character_ngrams_dict = get_freq_character_ngrams(
841            document, character_repetition_length
842        )
843        if len(freq_character_ngrams_dict) == 0:
844            return 0.0
845        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
846        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
847        val_one = len([el for el in freq_character_ngrams if el == 1])
848        num_rep_character_ngrams = min(
849            int(np.sqrt(len(freq_character_ngrams))),
850            len(freq_character_ngrams) - val_one,
851        )
852        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
853            freq_character_ngrams
854        )
855        return character_repetition_ratio
class WordRepetitionRatioFilter(hojichar.core.filter_interface.Filter):
858class WordRepetitionRatioFilter(Filter):
859    """
860    [!CAUTION] This filter requires `fugashi` package. Please install it
861    by `pip install 'hojichar[all]'`.
862
863    単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ.
864    BigScienceで採用されていた前処理を参考にしている.
865
866    名詞が連打されているような広告テキストを取り除くのに有効な様子
867    まともな文書がたまたま2回繰り返されている場合もあり、これを取り除いて良いのかは分からない
868    例:
869    "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57)
870    ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ
871    られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる
872    なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上
873    高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら
874    経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時
875    56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄
876    り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入
877    るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増
878    益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、
879    電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ
880    イの回復で収益が急回"
881    """  # noqa: E501
882
883    def __init__(
884        self,
885        threshold: float = 0.40,
886        ngram_size: int = 7,
887        max_parse_chars: int = 100_000,
888        *args: Any,
889        **kwargs: Any,
890    ) -> None:
891        """
892
893        Args:
894            threshold: document whose character repetition ratio is higher than this value will be discarded
895            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
896            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
897            *args:
898            **kwargs:
899        """  # noqa: E501
900        super().__init__(*args, **kwargs)
901        assert is_loaded_extras, (
902            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
903        )
904
905        self.threshold = threshold
906        self.ngram_size = ngram_size
907        self.max_parse_chars = max_parse_chars
908        self.tagger = Tagger("-Owakati")
909
910    def _chunk_text(self, text: str) -> Iterable[str]:
911        """Split text into chunks of `max_parse_chars` length."""
912        step = self.max_parse_chars
913        for i in range(0, len(text), step):
914            yield text[i : i + step]
915
916    def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]:
917        freq: Dict[str, int] = {}
918        if n <= 0 or len(words) < n:
919            return freq
920        for i in range(len(words) - n + 1):
921            key = " ".join(words[i : i + n])
922            freq[key] = freq.get(key, 0) + 1
923        return freq
924
925    def apply(self, doc: Document) -> Document:
926        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
927        if ratio >= self.threshold:
928            doc.is_rejected = True
929        return doc
930
931    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
932        total_counter: Counter[str] = Counter()
933
934        for chunk in self._chunk_text(document):
935            words = [w.surface for w in self.tagger(chunk)]
936            total_counter.update(self._get_freq_word_ngrams(words, n))
937
938        if not total_counter:
939            return 0.0
940
941        total = sum(total_counter.values())
942        repeated = sum(v for v in total_counter.values() if v > 1)
943        return repeated / total

[!CAUTION] This filter requires the fugashi package. Please install it with pip install 'hojichar[all]'.

A filter that computes the word n-gram overlap ratio (the share of a document occupied by duplicated word n-grams) and rejects documents with a large overlap.
Based on the preprocessing adopted by BigScience.

It appears effective for removing advertisement text in which nouns are strung together.
A reasonable document that just happens to be repeated twice can also be caught, and it is unclear whether such documents should really be removed.
Example:
"ウェブ

本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ) 2013/05/10(10:57) ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ) 2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ イの回復で収益が急回"
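A minimal sketch of inspecting the word n-gram repetition ratio directly, assuming hojichar[all] is installed so that fugashi is available; the fully repetitive sample text is an assumption for illustration:

```
from hojichar.filters.document_filters import WordRepetitionRatioFilter

f = WordRepetitionRatioFilter(threshold=0.40, ngram_size=7)
text = "今日は晴れです。明日も晴れです。" * 4  # the same sentences repeated
ratio = f.compute_word_repetition_ratio(text, f.ngram_size)
print(ratio)  # 1.0 here: every word 7-gram occurs more than once, so apply() rejects it
```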

WordRepetitionRatioFilter( threshold: float = 0.4, ngram_size: int = 7, max_parse_chars: int = 100000, *args: Any, **kwargs: Any)
883    def __init__(
884        self,
885        threshold: float = 0.40,
886        ngram_size: int = 7,
887        max_parse_chars: int = 100_000,
888        *args: Any,
889        **kwargs: Any,
890    ) -> None:
891        """
892
893        Args:
894            threshold: document whose character repetition ratio is higher than this value will be discarded
895            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
896            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
897            *args:
898            **kwargs:
899        """  # noqa: E501
900        super().__init__(*args, **kwargs)
901        assert is_loaded_extras, (
902            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
903        )
904
905        self.threshold = threshold
906        self.ngram_size = ngram_size
907        self.max_parse_chars = max_parse_chars
908        self.tagger = Tagger("-Owakati")

Args:
    threshold: document whose character repetition ratio is higher than this value will be discarded
    ngram_size: character ngram size. Larger value will decrease the false positive of long documents
    max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
925    def apply(self, doc: Document) -> Document:
926        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
927        if ratio >= self.threshold:
928            doc.is_rejected = True
929        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document

def compute_word_repetition_ratio(self, document: str, n: int) -> float:
931    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
932        total_counter: Counter[str] = Counter()
933
934        for chunk in self._chunk_text(document):
935            words = [w.surface for w in self.tagger(chunk)]
936            total_counter.update(self._get_freq_word_ngrams(words, n))
937
938        if not total_counter:
939            return 0.0
940
941        total = sum(total_counter.values())
942        repeated = sum(v for v in total_counter.values() if v > 1)
943        return repeated / total
class DiscardTooManySpecialToken(hojichar.core.filter_interface.Filter):
946class DiscardTooManySpecialToken(Filter):
947    """
948    [!CAUTION] This filter requires `emoji` package. Please install it
949    by `pip install 'hojichar[all]'`.
950
951    句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ
952    元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16  # noqa: E501
953    """
954
955    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
956        """
957
958        Args:
959            threshold: document whose special token ratio is higher than this value will be discarded
960            *args:
961            **kwargs:
962        """  # noqa: E501
963        super().__init__(*args, **kwargs)
964
965        # digits are not regarded as special tokens
966        # otherwise many false positives are made, i.e., good documents discarded
967        main_special_characters = string.punctuation + string.whitespace  # + string.digits
968        other_special_characters = (
969            "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–▬…✦�­£​•€«»°·═"
970            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
971            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
972            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
973            "」﴾》�"
974        )
975
976        en_emoji = emoji.EMOJI_DATA.keys()
977
978        special_characters_default = set(main_special_characters + other_special_characters)
979        special_characters_default.update(en_emoji)
980        self.special_characters = special_characters_default
981
982        self.threshold = threshold
983
984    def _compute_special_characters_ratio(self, text: str) -> float:
985        if len(text) == 0:
986            return 0
987
988        special_characters_ratio = len(
989            [char for char in text if char in self.special_characters]
990        ) / len(text)
991        return special_characters_ratio
992
993    def apply(self, doc: Document) -> Document:
994        special_characters_ratio = self._compute_special_characters_ratio(doc.text)
995
996        if special_characters_ratio > self.threshold:
997            doc.is_rejected = True
998        return doc

[!CAUTION] This filter requires the emoji package. Please install it with pip install 'hojichar[all]'.

A filter for removing documents in which symbols (including punctuation), whitespace, emoji, and other special characters make up more than a given ratio of the text. Original implementation: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16
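A minimal usage sketch, assuming hojichar[all] is installed so that the emoji package is available; the sample strings are assumptions for illustration:

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManySpecialToken

f = DiscardTooManySpecialToken(threshold=0.4)
print(f.apply(Document("これは普通の文です。")).is_rejected)  # False: only 「。」 is special
print(f.apply(Document("!!!***???…(笑)★★★")).is_rejected)  # True: almost every character is a symbol
```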

DiscardTooManySpecialToken(threshold: float = 0.4, *args: Any, **kwargs: Any)
955    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
956        """
957
958        Args:
959            threshold: document whose special token ratio is higher than this value will be discarded
960            *args:
961            **kwargs:
962        """  # noqa: E501
963        super().__init__(*args, **kwargs)
964
965        # digits are not regarded as special tokens
966        # otherwise many false positives are made, i.e., good documents discarded
967        main_special_characters = string.punctuation + string.whitespace  # + string.digits
968        other_special_characters = (
969            "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–▬…✦�­£​•€«»°·═"
970            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖"
971            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
972            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
973            "」﴾》�"
974        )
975
976        en_emoji = emoji.EMOJI_DATA.keys()
977
978        special_characters_default = set(main_special_characters + other_special_characters)
979        special_characters_default.update(en_emoji)
980        self.special_characters = special_characters_default
981
982        self.threshold = threshold

Args:
    threshold: document whose special token ratio is higher than this value will be discarded
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
993    def apply(self, doc: Document) -> Document:
994        special_characters_ratio = self._compute_special_characters_ratio(doc.text)
995
996        if special_characters_ratio > self.threshold:
997            doc.is_rejected = True
998        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document

class SingleCharacterRepetitionFilter(hojichar.core.filter_interface.Filter):
1001class SingleCharacterRepetitionFilter(Filter):
1002    """
1003    単一文字が大量に繰り返されているような文書を取り除くためのフィルタ
1004    そのような文書はノイズである可能性が高いため
1005    参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい
1006    https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset  # noqa: E501
1007    """
1008
1009    def __init__(
1010        self,
1011        threshold: int = 200,
1012        *args: Any,
1013        **kwargs: Any,
1014    ) -> None:
1015        """
1016        Args:
1017            threshold: The document is removed if character is repeated for this value or more
1018            *args:
1019            **kwargs:
1020        """
1021        super().__init__(*args, **kwargs)
1022        self.threshold = threshold
1023
1024    def _is_repeat_contained(self, text: str) -> bool:
1025        groups = groupby(text)
1026        is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups)
1027        return is_repeat_contained
1028
1029    def apply(self, doc: Document) -> Document:
1030        if self._is_repeat_contained(doc.text):
1031            doc.is_rejected = True
1032        return doc

A filter for removing documents in which a single character is repeated a huge number of times, since such documents are very likely to be noise. Reference: according to the BigScience project, the oscar dataset contained documents consisting of nothing but 2M backslashes. https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset
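A minimal usage sketch; the run of backslashes below is an assumption for illustration, mirroring the BigScience anecdote above:

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import SingleCharacterRepetitionFilter

f = SingleCharacterRepetitionFilter(threshold=200)
doc = f.apply(Document("前置き" + "\\" * 500 + "後書き"))
print(doc.is_rejected)  # True: contains a run of 500 identical characters (>= 200)
```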

SingleCharacterRepetitionFilter(threshold: int = 200, *args: Any, **kwargs: Any)
1009    def __init__(
1010        self,
1011        threshold: int = 200,
1012        *args: Any,
1013        **kwargs: Any,
1014    ) -> None:
1015        """
1016        Args:
1017            threshold: The document is removed if character is repeated for this value or more
1018            *args:
1019            **kwargs:
1020        """
1021        super().__init__(*args, **kwargs)
1022        self.threshold = threshold

Args:
    threshold: The document is removed if character is repeated for this value or more
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
1029    def apply(self, doc: Document) -> Document:
1030        if self._is_repeat_contained(doc.text):
1031            doc.is_rejected = True
1032        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document

class DiscardTooManyEndingEllipsis(hojichar.core.filter_interface.Filter):
1035class DiscardTooManyEndingEllipsis(Filter):
1036    """
1037    ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです.
1038    ellipsisとしては ... と … を用いている
1039    同様のフィルタが RedPajama v2で用いられています.
1040
1041    例として, 以下のような文書を検知します.
1042    ```
1043    ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
1044    バツイチアラフォー 婚活ち女性の特徴と子持な付...
1045    ```
1046
1047    デフォルトではしきい値を0.7としているが, これはC4から0.1%を削るような設定であり、
1048    precisionを重視した設定です.
1049    """
1050
1051    def __init__(
1052        self,
1053        threshold: float = 0.7,
1054        *args: Any,
1055        **kwargs: Any,
1056    ) -> None:
1057        """
1058        Args:
1059            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
1060            *args:
1061            **kwargs:
1062        """  # noqa: E501
1063        super().__init__(*args, **kwargs)
1064        self.threshold = threshold
1065        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n
1066
1067    def apply(self, doc: Document) -> Document:
1068        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
1069        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
1070        ellipsis_ratio = ellipsis_count / newline_count
1071
1072        if ellipsis_ratio > self.threshold:
1073            doc.is_rejected = True
1074        return doc

A filter for removing documents that contain many lines ending with an ellipsis. Both ... and … are treated as ellipses. A similar filter is used in RedPajama v2.

For example, it detects documents like the following:

ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
バツイチアラフォー 婚活ち女性の特徴と子持な付...

The default threshold is 0.7, a precision-oriented setting that trims roughly 0.1% of C4.
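A minimal usage sketch; the sample text is an assumption for illustration. Note that the ratio is computed against the number of newline characters, so a final line without a trailing newline is not counted:

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManyEndingEllipsis

f = DiscardTooManyEndingEllipsis(threshold=0.7)
text = "記事タイトルその1...\n記事タイトルその2...\n記事タイトルその3...\n本文はこちら"
print(f.apply(Document(text)).is_rejected)  # True: all 3 newlines are preceded by an ellipsis
```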

DiscardTooManyEndingEllipsis(threshold: float = 0.7, *args: Any, **kwargs: Any)
1051    def __init__(
1052        self,
1053        threshold: float = 0.7,
1054        *args: Any,
1055        **kwargs: Any,
1056    ) -> None:
1057        """
1058        Args:
1059            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
1060            *args:
1061            **kwargs:
1062        """  # noqa: E501
1063        super().__init__(*args, **kwargs)
1064        self.threshold = threshold
1065        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n

Args:
    threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
1067    def apply(self, doc: Document) -> Document:
1068        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
1069        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
1070        ellipsis_ratio = ellipsis_count / newline_count
1071
1072        if ellipsis_ratio > self.threshold:
1073            doc.is_rejected = True
1074        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document

class DiscardTooShortLines(hojichar.core.filter_interface.Filter):
1077class DiscardTooShortLines(Filter):
1078    """
1079    短い行を大量に含む文書を捨てるためのフィルタです.
1080
1081    メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です.
1082    """
1083
1084    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
1085        """
1086        Args:
1087            threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value.
1088            *args:
1089            **kwargs:
1090        """  # noqa: E501
1091        super().__init__(*args, **kwargs)
1092        self.threshold = threshold
1093        # この値は適当に決め打ち
1094        self.minimum_line_length = 10
1095
1096    def apply(self, doc: Document) -> Document:
1097        lines = [len(x) for x in doc.text.split("\n")]
1098        short_lines = [x for x in lines if x <= self.minimum_line_length]
1099        if (len(short_lines) / len(lines)) > self.threshold:
1100            doc.is_rejected = True
1101        return doc

A filter for discarding documents that contain many short lines.

It is effective for removing documents that are full of elements such as menu bars and breadcrumb lists.
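A minimal usage sketch; the breadcrumb-like sample text is an assumption for illustration:

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooShortLines

f = DiscardTooShortLines(threshold=0.5)
text = "ホーム\n製品情報\nお問い合わせ\nこの段落だけは十分に長い本文が書かれています。"
print(f.apply(Document(text)).is_rejected)  # True: 3 of 4 lines are 10 chars or fewer
```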

DiscardTooShortLines(threshold: float = 0.5, *args: Any, **kwargs: Any)
1084    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
1085        """
1086        Args:
1087            threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value.
1088            *args:
1089            **kwargs:
1090        """  # noqa: E501
1091        super().__init__(*args, **kwargs)
1092        self.threshold = threshold
1093        # この値は適当に決め打ち
1094        self.minimum_line_length = 10

Args:
    threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value.
    *args:
    **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
1096    def apply(self, doc: Document) -> Document:
1097        lines = [len(x) for x in doc.text.split("\n")]
1098        short_lines = [x for x in lines if x <= self.minimum_line_length]
1099        if (len(short_lines) / len(lines)) > self.threshold:
1100            doc.is_rejected = True
1101        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; the hojichar.Document class is most commonly used.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters
----------
document : Document
    Input document

Returns
-------
Document
    Processed Document