hojichar.filters.document_filters
1import json 2import logging 3import pathlib 4import re 5import string 6import time 7import unicodedata 8from collections import Counter 9from itertools import groupby 10from os import PathLike 11from typing import Any, Dict, Iterable, List, Optional, Union 12 13import numpy as np 14 15import hojichar 16from hojichar.core.filter_interface import Filter 17from hojichar.core.models import Document, Token 18 19try: 20 import emoji 21 from fugashi import Tagger # type: ignore 22 23 is_loaded_extras = True 24except ImportError: 25 is_loaded_extras = False 26 27BASE_PATH = pathlib.Path(hojichar.__path__[0]) 28logger = logging.getLogger(__name__) 29 30 31class ExampleHojiChar(Filter): 32 """基本的なフィルタの実装例です. 末尾に'<hojichar>'を追加します.""" 33 34 def apply(self, document: Document) -> Document: 35 """ 36 >>> ExampleHojiChar()("hello, world") 37 'hello, world<hojichar>' 38 """ 39 document.text += "<hojichar>" 40 return document 41 42 43class ExampleDiscardDocumentContainKeyword(Filter): 44 """特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です.""" 45 46 def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None: 47 super().__init__(*args, **kwargs) 48 self.keyword = keyword 49 50 def apply(self, document: Document) -> Document: 51 """ 52 >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected 53 True 54 """ 55 if self.keyword in document.text: 56 document.is_rejected = True 57 return document 58 59 60class Identity(Filter): 61 """何も変化を加えないフィルタです. テスト・デバッグに用いられます.""" 62 63 def apply(self, document: Document) -> Document: 64 return document 65 66 67class DiscardAll(Filter): 68 """ 69 すべてのドキュメントを破棄するフィルタです. 70 テスト・デバッグに用いられます. 71 """ 72 73 def apply(self, document: Document) -> Document: 74 document.is_rejected = True 75 return document 76 77 78class ApplyDiscard(Filter): 79 """ 80 上流フィルタで破棄された`Document`を空文字列にします. 81 82 `Document.is_rejected=True` の ドキュメントは無視されるため, 83 このフィルタを `Compose` のコンストラクタに渡しても動作しません. 84 このフィルタは主に`Compose` 内部や, `discard_filtered=False` を指定 85 したデバッグ時などに利用されます. 86 """ 87 88 def __init__(self, *args: Any, **kwargs: Any) -> None: 89 super().__init__(*args, **kwargs) 90 91 def apply(self, document: Document) -> Document: 92 """ 93 >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text 94 '' 95 """ 96 if document.is_rejected: 97 document.text = "" 98 99 return document 100 101 102class Sleep(Filter): 103 """ 104 デバッグ用のフィルタです. 指定秒スリープします. 105 """ 106 107 def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None: 108 super().__init__(*args, **kwargs) 109 self.time = time 110 111 def apply(self, document: Document) -> Document: 112 """ 113 >>> Sleep(0.1)('hello') # After 0.1 seconds, 114 'hello' 115 """ 116 time.sleep(self.time) 117 return document 118 119 120class DocumentNormalizer(Filter): 121 """ 122 Unicode の正規化をします. 123 """ 124 125 def __init__(self, *args: Any, **kwargs: Any) -> None: 126 super().__init__(*args, **kwargs) 127 128 def apply(self, document: Document) -> Document: 129 document.text = unicodedata.normalize("NFKC", document.text) 130 return document 131 132 133class JSONLoader(Filter): 134 """ 135 テキストを Json として解釈し, `key` で指定した要素を文字列として 136 doument に格納します.デフォルトの `key` は 'text' です. 137 138 Json の読み込み, あるいは `key` の読み込みに失敗した際には例外を送出します. 139 これらを無視する場合は, `ignore=True` にします. その際, 読み込みに失敗 140 したドキュメントは破棄されます. 
141 142 入力 Json に `extras` キー(辞書形式)が含まれている場合, Document.extras に自動的にマージされます。 143 さらに `extra_keys` でフィールドを指定すると, それらの値も Document.extras に追記され, 既存の extras 144 を上書きせずに統合できます。 145 """ 146 147 def __init__( 148 self, 149 key: str = "text", 150 ignore: bool = False, 151 extra_keys: Optional[List[str]] = None, 152 *args: Any, 153 **kwargs: Any, 154 ) -> None: 155 super().__init__(*args, **kwargs) 156 self.key = key 157 self.ignore = ignore 158 self.extra_keys = extra_keys 159 160 def apply(self, document: Document) -> Document: 161 """ 162 >>> JSONLoader()( '{"text": "hello, world", "words": 2}' ) 163 'hello, world' 164 165 >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON 166 Traceback (most recent call last): 167 ... 168 json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9) 169 170 >>> JSONLoader()( '{"words": 2}' ) 171 Traceback (most recent call last): 172 ... 173 KeyError: 'text' 174 175 >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected 176 True 177 """ 178 try: 179 data = json.loads(document.text) 180 document.text = str(data[self.key]) 181 if "extras" in data and isinstance(data["extras"], dict): 182 document.extras.update(data["extras"]) 183 if self.extra_keys is not None: 184 for key in self.extra_keys: 185 if key not in data: 186 continue 187 if key == "extras" and isinstance(data[key], dict): 188 document.extras.update(data[key]) 189 else: 190 document.extras[key] = data[key] 191 except Exception as e: 192 logger.error(f"Failed to parsing in JSONLoader. Input document: \n{document.text}") 193 if self.ignore: 194 document.is_rejected = True 195 return document 196 else: 197 raise e 198 199 return document 200 201 202class JSONDumper(Filter): 203 """ 204 Document.text の文字列を json に変換します. 205 必要に応じ Document のメタデータを付与します. これはドキュメントの破棄事由が含まれ、偽陽性の分析に有効です。 206 デフォルトで `skip_rejected` が `False` にセットされており、Document の破棄フラグにかかわらず 207 処理されます。 208 """ 209 210 def __init__( 211 self, 212 dump_reason: bool = False, 213 p: float = 1, 214 skip_rejected: bool = False, 215 export_extras: bool = False, 216 *args: Any, 217 **kwargs: Any, 218 ) -> None: 219 """ 220 Args: 221 dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False. 222 p (float, optional): Apply probability. Defaults to 1. 223 skip_rejected (bool, optional): 破棄済みサンプルを排除しません. 224 """ 225 super().__init__(p, skip_rejected, *args, **kwargs) 226 self.dump_reason = dump_reason 227 self.export_extras = export_extras 228 229 def apply(self, document: Document) -> Document: 230 """ 231 >>> JSONDumper()("hojichar") 232 '{"text": "hojichar"}' 233 """ 234 text = document.text 235 if self.dump_reason: 236 if self.export_extras: 237 output_extras = dict(document.extras) 238 document.text = json.dumps( 239 { 240 "text": text, 241 "is_rejected": document.is_rejected, 242 "reason": document.reject_reason, 243 "extras": output_extras, 244 }, 245 ensure_ascii=False, 246 ) 247 else: 248 document.text = json.dumps( 249 { 250 "text": text, 251 "is_rejected": document.is_rejected, 252 "reason": document.reject_reason, 253 }, 254 ensure_ascii=False, 255 ) 256 else: 257 if self.export_extras: 258 output_extras = dict(document.extras) 259 document.text = json.dumps( 260 { 261 "text": text, 262 "extras": output_extras, 263 }, 264 ensure_ascii=False, 265 ) 266 else: 267 document.text = json.dumps({"text": text}, ensure_ascii=False) 268 return document 269 270 271class DocumentLengthFilter(Filter): 272 """ 273 `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します. 
274 デフォルトでは 200字 以上 50000字以内のテキストが受理されます. 275 """ 276 277 def __init__( 278 self, 279 min_doc_len: Optional[int] = None, 280 max_doc_len: Optional[int] = None, 281 *args: Any, 282 **kwargs: Any, 283 ) -> None: 284 super().__init__(*args, **kwargs) 285 286 self.min_doc_len = min_doc_len 287 self.max_doc_len = max_doc_len 288 289 def apply(self, doc: Document) -> Document: 290 """ 291 >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected 292 True 293 """ 294 doc_len = len(doc.text) 295 if self.min_doc_len is not None: 296 if doc_len < self.min_doc_len: 297 doc.is_rejected = True 298 if self.max_doc_len is not None: 299 if self.max_doc_len < doc_len: 300 doc.is_rejected = True 301 return doc 302 303 304class NgWordsFilterJa(Filter): 305 """ 306 日本語のNGワード(および不適切語)を含む文書を破棄します. 307 `dict_path` で指定したファイルから, キーワードのリストを得ます. 308 ファイルは単語が改行で羅列されたテキストファイルです. 309 310 `ignore_confused` を `True` にすると, 311 偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます. 312 デフォルト値は `False` です. 313 """ 314 315 def __init__( 316 self, 317 dict_path: Union[str, PathLike], 318 ignore_confused: bool = False, 319 *args: Any, 320 **kwargs: Any, 321 ) -> None: 322 super().__init__(*args, **kwargs) 323 324 with open(dict_path, encoding="utf-8") as fp: 325 ng_words = fp.read().split("\n") 326 ng_words = [w.strip() for w in ng_words if not len(w) == 0] 327 328 if ignore_confused: 329 words_katakana = [] 330 words_not_katakana = [] 331 for w in ng_words: 332 if re.fullmatch(r"[ァ-ヴー]+", w): 333 words_katakana.append(re.escape(w)) 334 else: 335 words_not_katakana.append(re.escape(w)) 336 katakana_pat = "|".join(words_katakana) 337 katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])" 338 pat = "|".join(words_not_katakana) + "|" + katakana_pat 339 self.keyword_pat = re.compile(pat) 340 else: 341 ng_words = [re.escape(w) for w in ng_words] 342 pat = "|".join(ng_words) 343 self.keyword_pat = re.compile(pat) 344 345 def apply(self, doc: Document) -> Document: 346 regex_match = self.keyword_pat.search(doc.text) 347 if regex_match: 348 doc.is_rejected = True 349 self.matched_text = regex_match.group() 350 self.matched_text_neighbor = doc.text[ 351 regex_match.start() - 20 : regex_match.end() + 20 352 ] 353 354 return doc 355 356 357class NgWordsFilterEn(Filter): 358 """ 359 英語のNGワード(および不適切語)を含む文書を破棄します. 360 `dict_path` で指定したファイルから, キーワードのリストを得ます. 361 ファイルは単語が改行で羅列されたテキストファイルです. 362 """ 363 364 def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None: 365 super().__init__(*args, **kwargs) 366 367 with open(dict_path, encoding="utf-8") as fp: 368 ng_words = fp.read().split("\n") 369 ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0] 370 pat = "|".join(ng_words) 371 # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ. 372 self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE) 373 374 def apply(self, doc: Document) -> Document: 375 if self.keyword_pat.search(doc.text): 376 doc.is_rejected = True 377 return doc 378 379 380class DiscardAdultContentJa(NgWordsFilterJa): 381 """ 382 日本語のアダルトキーワード(および不適切語)を含む文書を破棄します. 383 `dict_path` で指定したファイルから, キーワードのリストを得ます. 384 ファイルは単語が改行で羅列されたテキストファイルです. 385 デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です. 
386 """ 387 388 def __init__( 389 self, 390 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt", 391 *args: Any, 392 **kwargs: Any, 393 ) -> None: 394 super().__init__(dict_path, *args, **kwargs) 395 396 def apply(self, doc: Document) -> Document: 397 """ 398 >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 399 True 400 401 >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected 402 False 403 404 挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック, 405 >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \ 406 # Matching with NG keyword "アス" 407 True 408 """ 409 return super().apply(doc) 410 411 412class DiscardAdultContentEn(NgWordsFilterEn): 413 """ 414 英語のアダルトキーワード(および不適切語)を含む文書を破棄します. 415 `dict_path` で指定したファイルから, キーワードのリストを得ます. 416 ファイルは単語が改行で羅列されたテキストファイルです. 417 デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です. 418 """ 419 420 def __init__( 421 self, 422 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt", 423 *args: Any, 424 **kwargs: Any, 425 ) -> None: 426 super().__init__(dict_path, *args, **kwargs) 427 428 def apply(self, doc: Document) -> Document: 429 """ 430 >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 431 True 432 433 >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected 434 False 435 """ 436 return super().apply(doc) 437 438 439class DiscardDiscriminationContentJa(NgWordsFilterJa): 440 """ 441 日本語の差別キーワード(および不適切語)を含む文書を破棄します. 442 `dict_path` で指定したファイルから, キーワードのリストを得ます. 443 ファイルは単語が改行で羅列されたテキストファイルです. 444 デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です. 445 """ 446 447 def __init__( 448 self, 449 dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt", 450 *args: Any, 451 **kwargs: Any, 452 ): 453 super().__init__(dict_path, *args, **kwargs) 454 455 def apply(self, doc: Document) -> Document: 456 """ 457 >>> DiscardDiscriminationContentJa().\ 458 apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected 459 True 460 461 >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected 462 False 463 """ 464 return super().apply(doc) 465 466 467class DiscardViolenceContentJa(NgWordsFilterJa): 468 """ 469 日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します. 470 `dict_path` で指定したファイルから, キーワードのリストを得ます. 471 ファイルは単語が改行で羅列されたテキストファイルです. 472 デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です. 473 """ 474 475 def __init__( 476 self, 477 dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt", 478 *args: Any, 479 **kwargs: Any, 480 ) -> None: 481 super().__init__(dict_path, *args, **kwargs) 482 483 def apply(self, doc: Document) -> Document: 484 """ 485 >>> DiscardViolenceContentJa()\ 486 .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected 487 True 488 489 >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected 490 False 491 """ 492 return super().apply(doc) 493 494 495class DiscardBBSComments(Filter): 496 """ 497 正規表現 "BBS Pattern" に `max_allow_num` 回よりたくさんマッチする文書を破棄します. 498 `max_allow_num` のデフォルト値は14です. 499 正規表現 "BBS Pattern" は下記のリンクで検証可能です. 
500 https://regex101.com/r/ybQvL2/1 501 """ 502 503 def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None: 504 super().__init__(*args, **kwargs) 505 506 self.max_allowed_num = max_allowed_num 507 self.keyword_pat = re.compile( 508 r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-" # noqa 509 ) 510 511 def apply(self, doc: Document) -> Document: 512 """ 513 >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected 514 True 515 516 >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected 517 False 518 """ 519 bbs_factor = self.keyword_pat.findall(doc.text) 520 if len(bbs_factor) > self.max_allowed_num: 521 doc.is_rejected = True 522 return doc 523 524 525class DiscardAds(Filter): 526 """ 527 主に広告キーワードを`max_allow_num`より多く含む文書を破棄します. 528 デフォルトで`max_allow_num` は14です. 529 `dict_path` で指定したファイルから, 広告キーワードのリストを得ます. 530 ファイルは単語が改行で羅列されたテキストファイルです. 531 デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です. 532 """ 533 534 def __init__( 535 self, 536 dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt", 537 max_allowed_num: int = 14, 538 *args: Any, 539 **kwargs: Any, 540 ): 541 super().__init__(*args, **kwargs) 542 543 self.max_allow_num = max_allowed_num 544 with open(dict_path, encoding="utf-8") as fp: 545 ng_words = fp.read().split("\n") 546 ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0] 547 pat = r"|".join(ng_words) 548 self.keyword_pat = re.compile(pat) 549 550 def apply(self, doc: Document) -> Document: 551 """ 552 >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected 553 True 554 555 >>> DiscardAds().apply(Document("おはよう")).is_rejected 556 False 557 """ 558 ads_factor = self.keyword_pat.findall(doc.text) 559 if len(ads_factor) > self.max_allow_num: 560 doc.is_rejected = True 561 return doc 562 563 564class AcceptJapanese(Filter): 565 """ 566 日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます. 567 1. テキストを左から`lookup_size` (デフォルトで50字) 参照し, 568 ひらがな・カタカナが存在すれば日本語と判定する. 569 """ 570 571 def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None: 572 super().__init__(*args, **kwargs) 573 574 self.lookup_size = lookup_size 575 self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]") 576 577 def apply(self, doc: Document) -> Document: 578 """ 579 >>> AcceptJapanese().apply(Document("This is English document")).is_rejected 580 True 581 582 >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected 583 True 584 585 >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected 586 False 587 """ 588 if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]): 589 doc.is_rejected = True 590 return doc 591 592 593class DiscardRareKuten(Filter): 594 """ 595 日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます 596 ドキュメントを句点"。"で区切り, 平均文長が 597 `max_avarage_sentence_length` より長い場合は破棄します. 598 `max_avarage_sentence_length` のデフォルト値は100です. 599 このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します. 
600 """ 601 602 def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None: 603 super().__init__(*args, **kwargs) 604 605 self.max_average_sentence_length = max_average_sentence_length 606 self.kuten_pat = re.compile(r"。") 607 608 def apply(self, doc: Document) -> Document: 609 """ 610 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected 611 False 612 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected 613 True 614 """ 615 kuten_lst = self.kuten_pat.findall(doc.text) 616 min_kuten_num = len(doc.text) / self.max_average_sentence_length 617 if len(kuten_lst) < min_kuten_num: 618 doc.is_rejected = True 619 return doc 620 621 622class HeaderFooterTagsRemover(Filter): 623 """ 624 ドキュメントの冒頭・末尾のトークンを調査し, ヘッダー・フッダー的な 625 タグが存在していた場合, そのトークンを除去します. 626 627 このフィルタを通す前に, 事前にセンテンスレベルにトーカナイズしておいてください. 628 このフィルタでは Document.token にのみ変更が加えられるので, 出力前 あるいは 下流フィルタで 629 Document.text に変更を加える前にトークンをマージしておいてください. 630 """ 631 632 def __init__( 633 self, 634 dict_path: Union[str, PathLike] = BASE_PATH / "dict/header_footer_keywords_ja.txt", 635 *args: Any, 636 **kwargs: Any, 637 ) -> None: 638 super().__init__(*args, **kwargs) 639 640 with open(dict_path) as fp: 641 keywords = fp.read().split("\n") 642 keywords = [re.escape(w.strip()) for w in keywords if not len(w) == 0] 643 self.keyword_pat = re.compile(r"|".join(keywords)) 644 645 def apply(self, doc: Document) -> Document: 646 if len(doc.tokens) == 0: 647 return doc 648 649 lookup_size = 0 650 if 1 <= len(doc.tokens) < 4: 651 lookup_size = 1 652 elif 4 <= len(doc.tokens) < 6: 653 lookup_size = 2 654 elif 6 <= len(doc.tokens): 655 lookup_size = 3 656 657 for i in range(lookup_size): 658 if self.should_drop_token(doc.tokens[i]): 659 doc.tokens[i].is_rejected = True 660 if self.should_drop_token(doc.tokens[-(i + 1)]): 661 doc.tokens[i].is_rejected = True 662 663 return doc 664 665 def should_drop_token(self, token: Token) -> bool: 666 """ 667 >>> HeaderFooterTagsRemover().should_drop_token(Token("<TEST_STRING_OF_KEYWORD>")) 668 True 669 670 >>> HeaderFooterTagsRemover().should_drop_token(Token("ほうじ茶")) 671 False 672 673 Comment. 674 Original legacy code removed a pattern r"« _ | Main | _ »" . 675 In the pattern, "|" is not escaped, so **ANY** string was eliminated. 676 It seems unintended behavior, so I fix this. 677 """ 678 if self.keyword_pat.match(token.text): 679 return True 680 else: 681 return False 682 683 684class MaskPersonalInformation(Filter): 685 """ 686 ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします. 
687 """ 688 689 def __init__(self, *args: Any, **kwargs: Any) -> None: 690 super().__init__(*args, **kwargs) 691 692 self.phone_pat = re.compile( 693 r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}" # noqa 694 ) 695 self.email_pat = re.compile( 696 r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)" # noqa 697 ) 698 699 def apply(self, doc: Document) -> Document: 700 """ 701 >>> MaskPersonalInformation()('06-1234-5678') 702 '06-1234-XXXX' 703 >>> MaskPersonalInformation()('075-123-4567') 704 '075-123-XXXX' 705 >>> MaskPersonalInformation()('0166-12-3456') 706 '0166-12-XXXX' 707 >>> MaskPersonalInformation()('09808-1-2345') 708 '09808-1-XXXX' 709 >>> MaskPersonalInformation()('090-1234-5678') 710 '090-1234-XXXX' 711 >>> MaskPersonalInformation()('0751234567') 712 '075123XXXX' 713 >>> MaskPersonalInformation()('08012345678') 714 '0801234XXXX' 715 >>> MaskPersonalInformation()('連絡は075-123-4567 まで') 716 '連絡は075-123-XXXX まで' 717 >>> MaskPersonalInformation()('+81-80-1234-5678') 718 '+81-80-1234-XXXX' 719 >>> MaskPersonalInformation()('+818012345678') 720 '+81801234XXXX' 721 >>> MaskPersonalInformation()('hogehoge@example.com') 722 'xxxx@yyy.com' 723 >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡') 724 '何かあれば xxxx@yyy.jp まで連絡' 725 """ 726 text = self.phone_pat.sub(r"\1XXXX", doc.text) 727 text = self.email_pat.sub(r"xxxx@yyy\1", text) 728 doc.text = text 729 return doc 730 731 732class DiscardTooManyNouns(Filter): 733 """ 734 [!CAUTION] This filter requires `fugashi` package. Please install it 735 by `pip install 'hojichar[all]'`. 736 737 A filter that removes document with too many nouns in Japanese i.e., 738 documents such as advertisement, word salad, etc ... 739 """ 740 741 def __init__( 742 self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any 743 ) -> None: 744 """ 745 Args: 746 threshold: document whose noun ratio is higher than this value will be discarded 747 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 748 *args: 749 **kwargs: 750 """ 751 super().__init__(*args, **kwargs) 752 assert is_loaded_extras, ( 753 "fugashi is required for this filter. 
Try pip install 'hojichar[all]'" 754 ) 755 756 self.threshold = threshold 757 self.max_parse_chars = max_parse_chars 758 self.tagger = Tagger("-Owakati") 759 assert "unidic" in self.tagger.dictionary_info[0]["filename"], ( 760 "MeCab dictionary must be unidic" 761 ) 762 763 def _chunk_text(self, text: str) -> Iterable[str]: 764 """Slice text into chunks of `max_parse_chars` length.""" 765 step = self.max_parse_chars 766 for i in range(0, len(text), step): 767 yield text[i : i + step] 768 769 def apply(self, doc: Document) -> Document: 770 """ 771 >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected 772 False 773 >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected 774 True 775 >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected 776 False 777 """ 778 # remove "補助記号" from part-of-speech statistics 779 # because they often decrease the noun ratio, 780 # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5 781 # however, we don't want such sentence 782 783 pos_count: Counter[str] = Counter() 784 for chunk in self._chunk_text(doc.text): 785 for word in self.tagger(chunk): 786 if word.feature.pos1 != "補助記号": 787 pos_count[word.feature.pos1] += 1 788 789 try: 790 noun_ratio = pos_count["名詞"] / sum(pos_count.values()) 791 except ZeroDivisionError: 792 noun_ratio = 0.0 793 if noun_ratio >= self.threshold: 794 doc.is_rejected = True 795 return doc 796 797 798class CharRepetitionRatioFilter(Filter): 799 """ 800 文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します. 801 名詞の連続からなるような広告テキストを取り除くのに有効です. 802 803 実装は, BigScience で採用されていた前処理を参考にしています. 804 元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453 # noqa: E501 805 806 「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが, 807 これは文書長の影響を軽減するためだとされています. 808 809 掲示板のテキストが引っかかりやすい傾向があります. 810 13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0 811 的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう 812 """ 813 814 def __init__( 815 self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any 816 ) -> None: 817 """ 818 819 Args: 820 threshold: document with character repetition ratio higher than this value will be discarded 821 ngram_size: character ngram size. 
Larger value will decrease the false positive of long documents 822 *args: 823 **kwargs: 824 """ # noqa: E501 825 826 super().__init__(*args, **kwargs) 827 self.threshold = threshold 828 self.ngram_size = ngram_size 829 830 def apply(self, doc: Document) -> Document: 831 ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size) 832 if ratio >= self.threshold: 833 doc.is_rejected = True 834 return doc 835 836 @staticmethod 837 def compute_character_repetition_ratio( 838 document: str, character_repetition_length: int 839 ) -> float: 840 def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]: 841 character_ngrams: List[str] = [ 842 document[i : i + n] for i in range(len(document) - n + 1) 843 ] 844 freq_character_ngrams_dict: Dict[str, int] = {} 845 for character_ngram in character_ngrams: 846 freq_character_ngrams_dict[character_ngram] = ( 847 freq_character_ngrams_dict.get(character_ngram, 0) + 1 848 ) 849 return freq_character_ngrams_dict 850 851 freq_character_ngrams_dict = get_freq_character_ngrams( 852 document, character_repetition_length 853 ) 854 if len(freq_character_ngrams_dict) == 0: 855 return 0.0 856 freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values()) 857 freq_character_ngrams = sorted(freq_character_ngrams, reverse=True) 858 val_one = len([el for el in freq_character_ngrams if el == 1]) 859 num_rep_character_ngrams = min( 860 int(np.sqrt(len(freq_character_ngrams))), 861 len(freq_character_ngrams) - val_one, 862 ) 863 character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum( 864 freq_character_ngrams 865 ) 866 return character_repetition_ratio 867 868 869class WordRepetitionRatioFilter(Filter): 870 """ 871 [!CAUTION] This filter requires `fugashi` package. Please install it 872 by `pip install 'hojichar[all]'`. 873 874 単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ. 875 BigScienceで採用されていた前処理を参考にしている. 876 877 名詞が連打されているような広告テキストを取り除くのに有効な様子 878 まともな文書がたまたま2回繰り返されている場合もあり、これを取り除いて良いのかは分からない 879 例: 880 "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57) 881 ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ 882 られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる 883 なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 884 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 885 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 886 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 887 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 888 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 889 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 890 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ 891 イの回復で収益が急回" 892 """ # noqa: E501 893 894 def __init__( 895 self, 896 threshold: float = 0.40, 897 ngram_size: int = 7, 898 max_parse_chars: int = 100_000, 899 *args: Any, 900 **kwargs: Any, 901 ) -> None: 902 """ 903 904 Args: 905 threshold: document whose character repetition ratio is higher than this value will be discarded 906 ngram_size: character ngram size. Larger value will decrease the false positive of long documents 907 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 
908 *args: 909 **kwargs: 910 """ # noqa: E501 911 super().__init__(*args, **kwargs) 912 assert is_loaded_extras, ( 913 "fugashi is required for this filter. Try pip install 'hojichar[all]'" 914 ) 915 916 self.threshold = threshold 917 self.ngram_size = ngram_size 918 self.max_parse_chars = max_parse_chars 919 self.tagger = Tagger("-Owakati") 920 921 def _chunk_text(self, text: str) -> Iterable[str]: 922 """Split text into chunks of `max_parse_chars` length.""" 923 step = self.max_parse_chars 924 for i in range(0, len(text), step): 925 yield text[i : i + step] 926 927 def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]: 928 freq: Dict[str, int] = {} 929 if n <= 0 or len(words) < n: 930 return freq 931 for i in range(len(words) - n + 1): 932 key = " ".join(words[i : i + n]) 933 freq[key] = freq.get(key, 0) + 1 934 return freq 935 936 def apply(self, doc: Document) -> Document: 937 ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size) 938 if ratio >= self.threshold: 939 doc.is_rejected = True 940 return doc 941 942 def compute_word_repetition_ratio(self, document: str, n: int) -> float: 943 total_counter: Counter[str] = Counter() 944 945 for chunk in self._chunk_text(document): 946 words = [w.surface for w in self.tagger(chunk)] 947 total_counter.update(self._get_freq_word_ngrams(words, n)) 948 949 if not total_counter: 950 return 0.0 951 952 total = sum(total_counter.values()) 953 repeated = sum(v for v in total_counter.values() if v > 1) 954 return repeated / total 955 956 957class DiscardTooManySpecialToken(Filter): 958 """ 959 [!CAUTION] This filter requires `emoji` package. Please install it 960 by `pip install 'hojichar[all]'`. 961 962 句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ 963 元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16 # noqa: E501 964 """ 965 966 def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None: 967 """ 968 969 Args: 970 threshold: document whose special token ratio is higher than this value will be discarded 971 *args: 972 **kwargs: 973 """ # noqa: E501 974 super().__init__(*args, **kwargs) 975 976 # digits are not regarded as special tokens 977 # otherwise many false positives are made, i.e., good documents discarded 978 main_special_characters = string.punctuation + string.whitespace # + string.digits 979 other_special_characters = ( 980 " ’“”–▬…✦�£•€«»°·═" 981 "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖" 982 "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" 983 "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" 984 "」﴾》�" 985 ) 986 987 en_emoji = emoji.EMOJI_DATA.keys() 988 989 special_characters_default = set(main_special_characters + other_special_characters) 990 special_characters_default.update(en_emoji) 991 self.special_characters = special_characters_default 992 993 self.threshold = threshold 994 995 def _compute_special_characters_ratio(self, text: str) -> float: 996 if len(text) == 0: 997 return 0 998 999 special_characters_ratio = len( 1000 [char for char in text if char in self.special_characters] 1001 ) / len(text) 1002 return special_characters_ratio 1003 1004 def apply(self, doc: Document) -> Document: 1005 special_characters_ratio = self._compute_special_characters_ratio(doc.text) 1006 1007 if special_characters_ratio > self.threshold: 1008 doc.is_rejected = True 1009 return doc 1010 1011 1012class 
SingleCharacterRepetitionFilter(Filter): 1013 """ 1014 単一文字が大量に繰り返されているような文書を取り除くためのフィルタ 1015 そのような文書はノイズである可能性が高いため 1016 参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい 1017 https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset # noqa: E501 1018 """ 1019 1020 def __init__( 1021 self, 1022 threshold: int = 200, 1023 *args: Any, 1024 **kwargs: Any, 1025 ) -> None: 1026 """ 1027 Args: 1028 threshold: The document is removed if character is repeated for this value or more 1029 *args: 1030 **kwargs: 1031 """ 1032 super().__init__(*args, **kwargs) 1033 self.threshold = threshold 1034 1035 def _is_repeat_contained(self, text: str) -> bool: 1036 groups = groupby(text) 1037 is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups) 1038 return is_repeat_contained 1039 1040 def apply(self, doc: Document) -> Document: 1041 if self._is_repeat_contained(doc.text): 1042 doc.is_rejected = True 1043 return doc 1044 1045 1046class DiscardTooManyEndingEllipsis(Filter): 1047 """ 1048 ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです. 1049 ellipsisとしては ... と … を用いている 1050 同様のフィルタが RedPajama v2で用いられています. 1051 1052 例として, 以下のような文書を検知します. 1053 ``` 1054 ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付... 1055 バツイチアラフォー 婚活ち女性の特徴と子持な付... 1056 ``` 1057 1058 デフォルトではしきい値を0.7としているが, これはC4から0.1%を削るような設定であり、 1059 precisionを重視した設定です. 1060 """ 1061 1062 def __init__( 1063 self, 1064 threshold: float = 0.7, 1065 *args: Any, 1066 **kwargs: Any, 1067 ) -> None: 1068 """ 1069 Args: 1070 threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value 1071 *args: 1072 **kwargs: 1073 """ # noqa: E501 1074 super().__init__(*args, **kwargs) 1075 self.threshold = threshold 1076 self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n") # matches ...\n and …\n 1077 1078 def apply(self, doc: Document) -> Document: 1079 ellipsis_count = len(self.ellipsis_pattern.findall(doc.text)) 1080 newline_count = max(doc.text.count("\n"), 1) # avoid zero division 1081 ellipsis_ratio = ellipsis_count / newline_count 1082 1083 if ellipsis_ratio > self.threshold: 1084 doc.is_rejected = True 1085 return doc 1086 1087 1088class DiscardTooShortLines(Filter): 1089 """ 1090 短い行を大量に含む文書を捨てるためのフィルタです. 1091 1092 メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です. 1093 """ 1094 1095 def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None: 1096 """ 1097 Args: 1098 threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value. 1099 *args: 1100 **kwargs: 1101 """ # noqa: E501 1102 super().__init__(*args, **kwargs) 1103 self.threshold = threshold 1104 # この値は適当に決め打ち 1105 self.minimum_line_length = 10 1106 1107 def apply(self, doc: Document) -> Document: 1108 lines = [len(x) for x in doc.text.split("\n")] 1109 short_lines = [x for x in lines if x <= self.minimum_line_length] 1110 if (len(short_lines) / len(lines)) > self.threshold: 1111 doc.is_rejected = True 1112 return doc
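As a quick orientation before the per-class reference below, here is a minimal usage sketch that chains several of these filters into a pipeline. It assumes the top-level hojichar.Compose API and a JSON Lines input; the filter choices and the input line are illustrative only.

from hojichar import Compose
from hojichar.filters import document_filters as df

cleaner = Compose([
    df.JSONLoader(key="text", ignore=True),                      # parse each JSONL line
    df.DocumentNormalizer(),                                      # NFKC normalization
    df.DocumentLengthFilter(min_doc_len=10, max_doc_len=50000),   # drop too-short / too-long docs
    df.JSONDumper(dump_reason=True),                              # serialize back to JSON
])

# Filters (and Compose) are callable on plain strings and return the processed text.
print(cleaner('{"text": "こんにちは、世界"}'))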
32class ExampleHojiChar(Filter): 33 """基本的なフィルタの実装例です. 末尾に'<hojichar>'を追加します.""" 34 35 def apply(self, document: Document) -> Document: 36 """ 37 >>> ExampleHojiChar()("hello, world") 38 'hello, world<hojichar>' 39 """ 40 document.text += "<hojichar>" 41 return document
An example of a basic filter implementation. Appends '<hojichar>' to the end of the text.
44class ExampleDiscardDocumentContainKeyword(Filter): 45 """特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です.""" 46 47 def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None: 48 super().__init__(*args, **kwargs) 49 self.keyword = keyword 50 51 def apply(self, document: Document) -> Document: 52 """ 53 >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected 54 True 55 """ 56 if self.keyword in document.text: 57 document.is_rejected = True 58 return document
An example of a filter that discards documents containing a specific keyword.
47 def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None: 48 super().__init__(*args, **kwargs) 49 self.keyword = keyword
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
51 def apply(self, document: Document) -> Document: 52 """ 53 >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected 54 True 55 """ 56 if self.keyword in document.text: 57 document.is_rejected = True 58 return document
>>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
True
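The two Example classes above show the pattern for writing a custom filter: subclass Filter, accept configuration in __init__, and implement apply. A minimal sketch under that pattern follows; the class and its behavior are hypothetical, and Filter/Document are assumed to be importable from the package top level (otherwise use hojichar.core.filter_interface.Filter and hojichar.core.models.Document, as in the module imports above).

from typing import Any

from hojichar import Document, Filter


class DiscardShortAndTag(Filter):
    """Hypothetical custom filter: reject very short documents, tag the rest."""

    def __init__(self, min_len: int = 5, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.min_len = min_len

    def apply(self, document: Document) -> Document:
        if len(document.text) < self.min_len:
            document.is_rejected = True   # discard, like ExampleDiscardDocumentContainKeyword
        else:
            document.text += "<ok>"       # modify the text, like ExampleHojiChar
        return document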
61class Identity(Filter): 62 """何も変化を加えないフィルタです. テスト・デバッグに用いられます.""" 63 64 def apply(self, document: Document) -> Document: 65 return document
A filter that makes no changes to the document. Used for testing and debugging.
Definition of filter behavior.
The document must implement the TextContent protocol;
in most cases this is the hojichar.Document class.
In this method, the filter may modify document.text or
document.extras, and may set document.is_rejected = True to discard the document.
Parameters
document : Document
Input document
Returns
Document
Processed Document
68class DiscardAll(Filter): 69 """ 70 すべてのドキュメントを破棄するフィルタです. 71 テスト・デバッグに用いられます. 72 """ 73 74 def apply(self, document: Document) -> Document: 75 document.is_rejected = True 76 return document
A filter that discards every document. Used for testing and debugging.
74 def apply(self, document: Document) -> Document: 75 document.is_rejected = True 76 return document
Definition of filter behavior.
The document must implement the TextContent protocol;
in most cases this is the hojichar.Document class.
In this method, the filter may modify document.text or
document.extras, and may set document.is_rejected = True to discard the document.
Parameters
document : Document
Input document
Returns
Document
Processed Document
79class ApplyDiscard(Filter): 80 """ 81 上流フィルタで破棄された`Document`を空文字列にします. 82 83 `Document.is_rejected=True` の ドキュメントは無視されるため, 84 このフィルタを `Compose` のコンストラクタに渡しても動作しません. 85 このフィルタは主に`Compose` 内部や, `discard_filtered=False` を指定 86 したデバッグ時などに利用されます. 87 """ 88 89 def __init__(self, *args: Any, **kwargs: Any) -> None: 90 super().__init__(*args, **kwargs) 91 92 def apply(self, document: Document) -> Document: 93 """ 94 >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text 95 '' 96 """ 97 if document.is_rejected: 98 document.text = "" 99 100 return document
Sets the text of a Document rejected by an upstream filter to the empty string.
Because documents with Document.is_rejected=True are ignored,
this filter has no effect when passed to the Compose constructor.
It is mainly used inside Compose, or when debugging with
discard_filtered=False.
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
103class Sleep(Filter): 104 """ 105 デバッグ用のフィルタです. 指定秒スリープします. 106 """ 107 108 def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None: 109 super().__init__(*args, **kwargs) 110 self.time = time 111 112 def apply(self, document: Document) -> Document: 113 """ 114 >>> Sleep(0.1)('hello') # After 0.1 seconds, 115 'hello' 116 """ 117 time.sleep(self.time) 118 return document
A filter for debugging. Sleeps for the specified number of seconds.
108 def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None: 109 super().__init__(*args, **kwargs) 110 self.time = time
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
121class DocumentNormalizer(Filter): 122 """ 123 Unicode の正規化をします. 124 """ 125 126 def __init__(self, *args: Any, **kwargs: Any) -> None: 127 super().__init__(*args, **kwargs) 128 129 def apply(self, document: Document) -> Document: 130 document.text = unicodedata.normalize("NFKC", document.text) 131 return document
Performs Unicode normalization (NFKC).
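A short sketch of what NFKC normalization does in practice: full-width alphanumerics, half-width katakana, and enclosed characters are folded into their canonical forms.

from hojichar import Document
from hojichar.filters.document_filters import DocumentNormalizer

doc = DocumentNormalizer().apply(Document("ＡＢＣ１２３　ﾎｳｼﾞﾁｬ ①"))
print(doc.text)  # "ABC123 ホウジチャ 1"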
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
129 def apply(self, document: Document) -> Document: 130 document.text = unicodedata.normalize("NFKC", document.text) 131 return document
Definition of filter behavior.
The document must implement the TextContent protocol;
in most cases this is the hojichar.Document class.
In this method, the filter may modify document.text or
document.extras, and may set document.is_rejected = True to discard the document.
Parameters
document : Document
Input document
Returns
Document
Processed Document
134class JSONLoader(Filter): 135 """ 136 テキストを Json として解釈し, `key` で指定した要素を文字列として 137 doument に格納します.デフォルトの `key` は 'text' です. 138 139 Json の読み込み, あるいは `key` の読み込みに失敗した際には例外を送出します. 140 これらを無視する場合は, `ignore=True` にします. その際, 読み込みに失敗 141 したドキュメントは破棄されます. 142 143 入力 Json に `extras` キー(辞書形式)が含まれている場合, Document.extras に自動的にマージされます。 144 さらに `extra_keys` でフィールドを指定すると, それらの値も Document.extras に追記され, 既存の extras 145 を上書きせずに統合できます。 146 """ 147 148 def __init__( 149 self, 150 key: str = "text", 151 ignore: bool = False, 152 extra_keys: Optional[List[str]] = None, 153 *args: Any, 154 **kwargs: Any, 155 ) -> None: 156 super().__init__(*args, **kwargs) 157 self.key = key 158 self.ignore = ignore 159 self.extra_keys = extra_keys 160 161 def apply(self, document: Document) -> Document: 162 """ 163 >>> JSONLoader()( '{"text": "hello, world", "words": 2}' ) 164 'hello, world' 165 166 >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON 167 Traceback (most recent call last): 168 ... 169 json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9) 170 171 >>> JSONLoader()( '{"words": 2}' ) 172 Traceback (most recent call last): 173 ... 174 KeyError: 'text' 175 176 >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected 177 True 178 """ 179 try: 180 data = json.loads(document.text) 181 document.text = str(data[self.key]) 182 if "extras" in data and isinstance(data["extras"], dict): 183 document.extras.update(data["extras"]) 184 if self.extra_keys is not None: 185 for key in self.extra_keys: 186 if key not in data: 187 continue 188 if key == "extras" and isinstance(data[key], dict): 189 document.extras.update(data[key]) 190 else: 191 document.extras[key] = data[key] 192 except Exception as e: 193 logger.error(f"Failed to parsing in JSONLoader. Input document: \n{document.text}") 194 if self.ignore: 195 document.is_rejected = True 196 return document 197 else: 198 raise e 199 200 return document
Parses the text as JSON and stores the element specified by key in the
document as a string. The default key is 'text'.
If the JSON cannot be parsed, or the key is missing, an exception is raised.
To ignore such errors, set ignore=True; documents that fail to load are
then rejected instead.
If the input JSON contains an extras key (in dictionary form), it is automatically merged into Document.extras.
Fields listed in extra_keys are also added to Document.extras,
merged alongside the existing extras rather than replacing them.
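A sketch of how the key, extras, and extra_keys options interact, based on the apply implementation below; the field names are illustrative.

from hojichar import Document
from hojichar.filters.document_filters import JSONLoader

line = '{"text": "hello", "url": "https://example.com", "extras": {"lang": "en"}}'
doc = JSONLoader(extra_keys=["url"]).apply(Document(line))
print(doc.text)    # 'hello'
print(doc.extras)  # {'lang': 'en', 'url': 'https://example.com'}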
148 def __init__( 149 self, 150 key: str = "text", 151 ignore: bool = False, 152 extra_keys: Optional[List[str]] = None, 153 *args: Any, 154 **kwargs: Any, 155 ) -> None: 156 super().__init__(*args, **kwargs) 157 self.key = key 158 self.ignore = ignore 159 self.extra_keys = extra_keys
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
161 def apply(self, document: Document) -> Document: 162 """ 163 >>> JSONLoader()( '{"text": "hello, world", "words": 2}' ) 164 'hello, world' 165 166 >>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON 167 Traceback (most recent call last): 168 ... 169 json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9) 170 171 >>> JSONLoader()( '{"words": 2}' ) 172 Traceback (most recent call last): 173 ... 174 KeyError: 'text' 175 176 >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected 177 True 178 """ 179 try: 180 data = json.loads(document.text) 181 document.text = str(data[self.key]) 182 if "extras" in data and isinstance(data["extras"], dict): 183 document.extras.update(data["extras"]) 184 if self.extra_keys is not None: 185 for key in self.extra_keys: 186 if key not in data: 187 continue 188 if key == "extras" and isinstance(data[key], dict): 189 document.extras.update(data[key]) 190 else: 191 document.extras[key] = data[key] 192 except Exception as e: 193 logger.error(f"Failed to parsing in JSONLoader. Input document: \n{document.text}") 194 if self.ignore: 195 document.is_rejected = True 196 return document 197 else: 198 raise e 199 200 return document
>>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
'hello, world'
>>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
Traceback (most recent call last):
...
json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
>>> JSONLoader()( '{"words": 2}' )
Traceback (most recent call last):
...
KeyError: 'text'
>>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
True
203class JSONDumper(Filter): 204 """ 205 Document.text の文字列を json に変換します. 206 必要に応じ Document のメタデータを付与します. これはドキュメントの破棄事由が含まれ、偽陽性の分析に有効です。 207 デフォルトで `skip_rejected` が `False` にセットされており、Document の破棄フラグにかかわらず 208 処理されます。 209 """ 210 211 def __init__( 212 self, 213 dump_reason: bool = False, 214 p: float = 1, 215 skip_rejected: bool = False, 216 export_extras: bool = False, 217 *args: Any, 218 **kwargs: Any, 219 ) -> None: 220 """ 221 Args: 222 dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False. 223 p (float, optional): Apply probability. Defaults to 1. 224 skip_rejected (bool, optional): 破棄済みサンプルを排除しません. 225 """ 226 super().__init__(p, skip_rejected, *args, **kwargs) 227 self.dump_reason = dump_reason 228 self.export_extras = export_extras 229 230 def apply(self, document: Document) -> Document: 231 """ 232 >>> JSONDumper()("hojichar") 233 '{"text": "hojichar"}' 234 """ 235 text = document.text 236 if self.dump_reason: 237 if self.export_extras: 238 output_extras = dict(document.extras) 239 document.text = json.dumps( 240 { 241 "text": text, 242 "is_rejected": document.is_rejected, 243 "reason": document.reject_reason, 244 "extras": output_extras, 245 }, 246 ensure_ascii=False, 247 ) 248 else: 249 document.text = json.dumps( 250 { 251 "text": text, 252 "is_rejected": document.is_rejected, 253 "reason": document.reject_reason, 254 }, 255 ensure_ascii=False, 256 ) 257 else: 258 if self.export_extras: 259 output_extras = dict(document.extras) 260 document.text = json.dumps( 261 { 262 "text": text, 263 "extras": output_extras, 264 }, 265 ensure_ascii=False, 266 ) 267 else: 268 document.text = json.dumps({"text": text}, ensure_ascii=False) 269 return document
Serializes the Document.text string to JSON.
Optionally, Document metadata is attached; this includes the reason a document was rejected, which is useful for analyzing false positives.
skip_rejected is set to False by default, so documents are processed
regardless of their rejection flag.
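A sketch of dump_reason=True used together with a rejecting filter; the exact content of the reason field depends on how the document was rejected upstream.

from hojichar import Document
from hojichar.filters.document_filters import DocumentLengthFilter, JSONDumper

doc = DocumentLengthFilter(min_doc_len=100).apply(Document("too short"))
dumped = JSONDumper(dump_reason=True).apply(doc)
# dumped.text is a JSON object with "text", "is_rejected" and "reason" entries,
# so rejected samples can be inspected after a run.
print(dumped.text)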
211 def __init__( 212 self, 213 dump_reason: bool = False, 214 p: float = 1, 215 skip_rejected: bool = False, 216 export_extras: bool = False, 217 *args: Any, 218 **kwargs: Any, 219 ) -> None: 220 """ 221 Args: 222 dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False. 223 p (float, optional): Apply probability. Defaults to 1. 224 skip_rejected (bool, optional): 破棄済みサンプルを排除しません. 225 """ 226 super().__init__(p, skip_rejected, *args, **kwargs) 227 self.dump_reason = dump_reason 228 self.export_extras = export_extras
Args:
dump_reason (bool, optional): Also dump the is_rejected and reason entries. Defaults to False.
p (float, optional): Apply probability. Defaults to 1.
skip_rejected (bool, optional): If False (default), rejected samples are not excluded from processing.
export_extras (bool, optional): Also dump Document.extras. Defaults to False.
230 def apply(self, document: Document) -> Document: 231 """ 232 >>> JSONDumper()("hojichar") 233 '{"text": "hojichar"}' 234 """ 235 text = document.text 236 if self.dump_reason: 237 if self.export_extras: 238 output_extras = dict(document.extras) 239 document.text = json.dumps( 240 { 241 "text": text, 242 "is_rejected": document.is_rejected, 243 "reason": document.reject_reason, 244 "extras": output_extras, 245 }, 246 ensure_ascii=False, 247 ) 248 else: 249 document.text = json.dumps( 250 { 251 "text": text, 252 "is_rejected": document.is_rejected, 253 "reason": document.reject_reason, 254 }, 255 ensure_ascii=False, 256 ) 257 else: 258 if self.export_extras: 259 output_extras = dict(document.extras) 260 document.text = json.dumps( 261 { 262 "text": text, 263 "extras": output_extras, 264 }, 265 ensure_ascii=False, 266 ) 267 else: 268 document.text = json.dumps({"text": text}, ensure_ascii=False) 269 return document
>>> JSONDumper()("hojichar")
'{"text": "hojichar"}'
272class DocumentLengthFilter(Filter): 273 """ 274 `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します. 275 デフォルトでは 200字 以上 50000字以内のテキストが受理されます. 276 """ 277 278 def __init__( 279 self, 280 min_doc_len: Optional[int] = None, 281 max_doc_len: Optional[int] = None, 282 *args: Any, 283 **kwargs: Any, 284 ) -> None: 285 super().__init__(*args, **kwargs) 286 287 self.min_doc_len = min_doc_len 288 self.max_doc_len = max_doc_len 289 290 def apply(self, doc: Document) -> Document: 291 """ 292 >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected 293 True 294 """ 295 doc_len = len(doc.text) 296 if self.min_doc_len is not None: 297 if doc_len < self.min_doc_len: 298 doc.is_rejected = True 299 if self.max_doc_len is not None: 300 if self.max_doc_len < doc_len: 301 doc.is_rejected = True 302 return doc
Discards documents whose length falls outside the bounds given by min_doc_len and max_doc_len.
Both bounds default to None, in which case the corresponding limit is not applied.
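A short sketch with both bounds set (the values are arbitrary):

from hojichar import Document
from hojichar.filters.document_filters import DocumentLengthFilter

length_filter = DocumentLengthFilter(min_doc_len=5, max_doc_len=100)
print(length_filter.apply(Document("1234")).is_rejected)   # True: shorter than 5 characters
print(length_filter.apply(Document("12345")).is_rejected)  # False: within [5, 100]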
278 def __init__( 279 self, 280 min_doc_len: Optional[int] = None, 281 max_doc_len: Optional[int] = None, 282 *args: Any, 283 **kwargs: Any, 284 ) -> None: 285 super().__init__(*args, **kwargs) 286 287 self.min_doc_len = min_doc_len 288 self.max_doc_len = max_doc_len
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
290 def apply(self, doc: Document) -> Document: 291 """ 292 >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected 293 True 294 """ 295 doc_len = len(doc.text) 296 if self.min_doc_len is not None: 297 if doc_len < self.min_doc_len: 298 doc.is_rejected = True 299 if self.max_doc_len is not None: 300 if self.max_doc_len < doc_len: 301 doc.is_rejected = True 302 return doc
>>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
True
305class NgWordsFilterJa(Filter): 306 """ 307 日本語のNGワード(および不適切語)を含む文書を破棄します. 308 `dict_path` で指定したファイルから, キーワードのリストを得ます. 309 ファイルは単語が改行で羅列されたテキストファイルです. 310 311 `ignore_confused` を `True` にすると, 312 偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます. 313 デフォルト値は `False` です. 314 """ 315 316 def __init__( 317 self, 318 dict_path: Union[str, PathLike], 319 ignore_confused: bool = False, 320 *args: Any, 321 **kwargs: Any, 322 ) -> None: 323 super().__init__(*args, **kwargs) 324 325 with open(dict_path, encoding="utf-8") as fp: 326 ng_words = fp.read().split("\n") 327 ng_words = [w.strip() for w in ng_words if not len(w) == 0] 328 329 if ignore_confused: 330 words_katakana = [] 331 words_not_katakana = [] 332 for w in ng_words: 333 if re.fullmatch(r"[ァ-ヴー]+", w): 334 words_katakana.append(re.escape(w)) 335 else: 336 words_not_katakana.append(re.escape(w)) 337 katakana_pat = "|".join(words_katakana) 338 katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])" 339 pat = "|".join(words_not_katakana) + "|" + katakana_pat 340 self.keyword_pat = re.compile(pat) 341 else: 342 ng_words = [re.escape(w) for w in ng_words] 343 pat = "|".join(ng_words) 344 self.keyword_pat = re.compile(pat) 345 346 def apply(self, doc: Document) -> Document: 347 regex_match = self.keyword_pat.search(doc.text) 348 if regex_match: 349 doc.is_rejected = True 350 self.matched_text = regex_match.group() 351 self.matched_text_neighbor = doc.text[ 352 regex_match.start() - 20 : regex_match.end() + 20 353 ] 354 355 return doc
Discards documents containing Japanese NG words (including inappropriate terms).
The keyword list is read from the file specified by dict_path.
The file is a plain-text file with one word per line.
If ignore_confused is True, katakana NG words are flagged only when they are not
adjacent to other katakana characters, which reduces false positives.
The default is False.
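A sketch of the dictionary format and the ignore_confused behavior; the NG words and the temporary file are made up for illustration.

import tempfile

from hojichar import Document
from hojichar.filters.document_filters import NgWordsFilterJa

# One NG word per line, as described above.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
    fp.write("ダメ\n禁止\n")
    dict_path = fp.name

ng_filter = NgWordsFilterJa(dict_path, ignore_confused=True)
print(ng_filter.apply(Document("これはダメな文書")).is_rejected)      # True
print(ng_filter.apply(Document("サラダメニューの紹介")).is_rejected)  # False: "ダメ" is inside a katakana word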
316 def __init__( 317 self, 318 dict_path: Union[str, PathLike], 319 ignore_confused: bool = False, 320 *args: Any, 321 **kwargs: Any, 322 ) -> None: 323 super().__init__(*args, **kwargs) 324 325 with open(dict_path, encoding="utf-8") as fp: 326 ng_words = fp.read().split("\n") 327 ng_words = [w.strip() for w in ng_words if not len(w) == 0] 328 329 if ignore_confused: 330 words_katakana = [] 331 words_not_katakana = [] 332 for w in ng_words: 333 if re.fullmatch(r"[ァ-ヴー]+", w): 334 words_katakana.append(re.escape(w)) 335 else: 336 words_not_katakana.append(re.escape(w)) 337 katakana_pat = "|".join(words_katakana) 338 katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])" 339 pat = "|".join(words_not_katakana) + "|" + katakana_pat 340 self.keyword_pat = re.compile(pat) 341 else: 342 ng_words = [re.escape(w) for w in ng_words] 343 pat = "|".join(ng_words) 344 self.keyword_pat = re.compile(pat)
Initialize the filter.
Parameters
p : float
The probability of applying the filter.
If p is 1, the filter will always be applied.
skip_rejected : bool
If True, the filter will skip documents that are already rejected.
If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
Seed for the random number generator.
If None, a new random number generator will be created.
If None and the filter is used within a Compose pipeline, the random state is shared with the Compose object.
use_batch : bool
If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
The size of the batch to process documents in the apply_stream method.
kwargs : Any
Additional keyword arguments to pass to the filter.
346 def apply(self, doc: Document) -> Document: 347 regex_match = self.keyword_pat.search(doc.text) 348 if regex_match: 349 doc.is_rejected = True 350 self.matched_text = regex_match.group() 351 self.matched_text_neighbor = doc.text[ 352 regex_match.start() - 20 : regex_match.end() + 20 353 ] 354 355 return doc
Definition of filter behavior.
The document must implement the TextContent protocol;
in most cases this is the hojichar.Document class.
In this method, the filter may modify document.text or
document.extras, and may set document.is_rejected = True to discard the document.
Parameters
document : Document
Input document
Returns
Document
Processed Document
358class NgWordsFilterEn(Filter): 359 """ 360 英語のNGワード(および不適切語)を含む文書を破棄します. 361 `dict_path` で指定したファイルから, キーワードのリストを得ます. 362 ファイルは単語が改行で羅列されたテキストファイルです. 363 """ 364 365 def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None: 366 super().__init__(*args, **kwargs) 367 368 with open(dict_path, encoding="utf-8") as fp: 369 ng_words = fp.read().split("\n") 370 ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0] 371 pat = "|".join(ng_words) 372 # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ. 373 self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE) 374 375 def apply(self, doc: Document) -> Document: 376 if self.keyword_pat.search(doc.text): 377 doc.is_rejected = True 378 return doc
Discards documents that contain English NG words (and other inappropriate terms).
The list of keywords is read from the file specified by dict_path.
The file is a plain-text file with one word per line.
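A small sketch of the boundary behavior of the English pattern (the keyword file below is hypothetical): a keyword is only flagged when it is delimited by the start or end of the text, a space, a comma, or a period, so a keyword buried inside a longer word is ignored.

import tempfile

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterEn

# Hypothetical keyword file with a single NG word.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
    fp.write("badword\n")
    dict_path = fp.name

en_filter = NgWordsFilterEn(dict_path)
print(en_filter.apply(Document("this contains a badword, clearly")).is_rejected)  # True
print(en_filter.apply(Document("notabadwordatall here")).is_rejected)             # False: no delimiter around the keyword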
class DiscardAdultContentJa(NgWordsFilterJa):
    """
    日本語のアダルトキーワード(および不適切語)を含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt",
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(dict_path, *args, **kwargs)

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
        True

        >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
        False

        挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック,
        >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \
        # Matching with NG keyword "アス"
        True
        """
        return super().apply(doc)
Discards documents that contain Japanese adult keywords (and other inappropriate terms).
The keyword list is read from the file specified by dict_path;
the file lists one word per line.
The default dict_path is /hojichar/dict/adult_keywords_ja.txt.
>>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
False
A case where the behavior is as designed but is still a false positive; another example is サック matching inside リュックサック.
>>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected # Matching with NG keyword "アス"
True
class DiscardAdultContentEn(NgWordsFilterEn):
    """
    英語のアダルトキーワード(および不適切語)を含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt",
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(dict_path, *args, **kwargs)

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
        True

        >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
        False
        """
        return super().apply(doc)
Discards documents that contain English adult keywords (and other inappropriate terms).
The keyword list is read from the file specified by dict_path;
the file lists one word per line.
The default dict_path is /hojichar/dict/adult_keywords_en.txt.
>>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
False
class DiscardDiscriminationContentJa(NgWordsFilterJa):
    """
    日本語の差別キーワード(および不適切語)を含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt",
        *args: Any,
        **kwargs: Any,
    ):
        super().__init__(dict_path, *args, **kwargs)

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardDiscriminationContentJa().\
            apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
        True

        >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
        False
        """
        return super().apply(doc)
Discards documents that contain Japanese discriminatory keywords (and other inappropriate terms).
The keyword list is read from the file specified by dict_path;
the file lists one word per line.
The default dict_path is /hojichar/dict/discrimination_keywords_ja.txt.
>>> DiscardDiscriminationContentJa().apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
True
>>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
False
class DiscardViolenceContentJa(NgWordsFilterJa):
    """
    日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt",
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(dict_path, *args, **kwargs)

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardViolenceContentJa()\
            .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
        True

        >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
        False
        """
        return super().apply(doc)
Discards documents that contain Japanese keywords suggesting violence or threats.
The keyword list is read from the file specified by dict_path;
the file lists one word per line.
The default dict_path is /hojichar/dict/violence_keywords_ja.txt.
>>> DiscardViolenceContentJa().apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
True
>>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
False
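The dictionary-backed filters above all ship with default keyword lists, so they can be chained into a single pipeline. A minimal sketch, assuming the hojichar.Compose pipeline class described in the package README; the sample sentence is arbitrary:

from hojichar import Compose
from hojichar.core.models import Document
from hojichar.filters.document_filters import (
    DiscardAdultContentJa,
    DiscardDiscriminationContentJa,
    DiscardViolenceContentJa,
)

cleaner = Compose([
    DiscardAdultContentJa(),
    DiscardDiscriminationContentJa(),
    DiscardViolenceContentJa(),
])

doc = cleaner.apply(Document("ほうじ茶の淹れ方について説明します。"))
print(doc.is_rejected)  # False: none of the default keyword lists match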
class DiscardBBSComments(Filter):
    """
    正規表現 "BBS Pattern" に `max_allow_num` 回よりたくさんマッチする文書を破棄します.
    `max_allow_num` のデフォルト値は14です.
    正規表現 "BBS Pattern" は下記のリンクで検証可能です.
    https://regex101.com/r/ybQvL2/1
    """

    def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.max_allowed_num = max_allowed_num
        self.keyword_pat = re.compile(
            r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-"  # noqa
        )

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
        True

        >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
        False
        """
        bbs_factor = self.keyword_pat.findall(doc.text)
        if len(bbs_factor) > self.max_allowed_num:
            doc.is_rejected = True
        return doc
正規表現 "BBS Pattern" に max_allow_num 回よりたくさんマッチする文書を破棄します.
max_allow_num のデフォルト値は14です.
正規表現 "BBS Pattern" は下記のリンクで検証可能です.
https://regex101.com/r/ybQvL2/1
>>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
True
>>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
False
class DiscardAds(Filter):
    """
    主に広告キーワードを`max_allow_num`より多く含む文書を破棄します.
    デフォルトで`max_allow_num` は14です.
    `dict_path` で指定したファイルから, 広告キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt",
        max_allowed_num: int = 14,
        *args: Any,
        **kwargs: Any,
    ):
        super().__init__(*args, **kwargs)

        self.max_allow_num = max_allowed_num
        with open(dict_path, encoding="utf-8") as fp:
            ng_words = fp.read().split("\n")
            ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
        pat = r"|".join(ng_words)
        self.keyword_pat = re.compile(pat)

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
        True

        >>> DiscardAds().apply(Document("おはよう")).is_rejected
        False
        """
        ads_factor = self.keyword_pat.findall(doc.text)
        if len(ads_factor) > self.max_allow_num:
            doc.is_rejected = True
        return doc
Discards documents that contain more than max_allowed_num advertisement keywords.
The default max_allowed_num is 14.
The keyword list is read from the file specified by dict_path;
the file lists one word per line.
The default dict_path is /hojichar/dict/advertisement_keywords_ja.txt.
>>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
True
>>> DiscardAds().apply(Document("おはよう")).is_rejected
False
class AcceptJapanese(Filter):
    """
    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます.
    1. テキストを左から`lookup_size` (デフォルトで50字) 参照し,
    ひらがな・カタカナが存在すれば日本語と判定する.
    """

    def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.lookup_size = lookup_size
        self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")

    def apply(self, doc: Document) -> Document:
        """
        >>> AcceptJapanese().apply(Document("This is English document")).is_rejected
        True

        >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
        True

        >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
        False
        """
        if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]):
            doc.is_rejected = True
        return doc
Discards documents that are not Japanese. Japanese is detected as follows:
1. Look at the first lookup_size characters of the text (50 by default);
if any hiragana or katakana characters are present, the document is judged to be Japanese.
>>> AcceptJapanese().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
True
>>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
False
class DiscardRareKuten(Filter):
    """
    日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます
    ドキュメントを句点"。"で区切り, 平均文長が
    `max_avarage_sentence_length` より長い場合は破棄します.
    `max_avarage_sentence_length` のデフォルト値は100です.
    このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します.
    """

    def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.max_average_sentence_length = max_average_sentence_length
        self.kuten_pat = re.compile(r"。")

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
        False
        >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
        True
        """
        kuten_lst = self.kuten_pat.findall(doc.text)
        min_kuten_num = len(doc.text) / self.max_average_sentence_length
        if len(kuten_lst) < min_kuten_num:
            doc.is_rejected = True
        return doc
Discards documents that are not Japanese, judged by how rarely the full stop "。" appears:
the document is split on "。", and if the average sentence length is greater than
max_average_sentence_length, the document is discarded.
The default value of max_average_sentence_length is 100.
In effect, this filter discards documents whose ratio of "。" is too low.
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
False
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
True
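The rejection rule is a simple ratio: the number of 。 found must be at least len(text) / max_average_sentence_length, otherwise the document is rejected. Working through the doctest above:

text = "おはよう。"                                        # 5 characters, one 。
max_average_sentence_length = 4
min_kuten_num = len(text) / max_average_sentence_length   # 5 / 4 = 1.25
print(len(text), min_kuten_num)                           # 5 1.25
# The text contains one 。, and 1 < 1.25, so the document is rejected.
# "おはよ。" has 4 characters, 4 / 4 = 1.0, and 1 >= 1.0, so it is kept.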
class MaskPersonalInformation(Filter):
    """
    ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        self.phone_pat = re.compile(
            r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}"  # noqa
        )
        self.email_pat = re.compile(
            r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)"  # noqa
        )

    def apply(self, doc: Document) -> Document:
        """
        >>> MaskPersonalInformation()('06-1234-5678')
        '06-1234-XXXX'
        >>> MaskPersonalInformation()('075-123-4567')
        '075-123-XXXX'
        >>> MaskPersonalInformation()('0166-12-3456')
        '0166-12-XXXX'
        >>> MaskPersonalInformation()('09808-1-2345')
        '09808-1-XXXX'
        >>> MaskPersonalInformation()('090-1234-5678')
        '090-1234-XXXX'
        >>> MaskPersonalInformation()('0751234567')
        '075123XXXX'
        >>> MaskPersonalInformation()('08012345678')
        '0801234XXXX'
        >>> MaskPersonalInformation()('連絡は075-123-4567 まで')
        '連絡は075-123-XXXX まで'
        >>> MaskPersonalInformation()('+81-80-1234-5678')
        '+81-80-1234-XXXX'
        >>> MaskPersonalInformation()('+818012345678')
        '+81801234XXXX'
        >>> MaskPersonalInformation()('hogehoge@example.com')
        'xxxx@yyy.com'
        >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
        '何かあれば xxxx@yyy.jp まで連絡'
        """
        text = self.phone_pat.sub(r"\1XXXX", doc.text)
        text = self.email_pat.sub(r"xxxx@yyy\1", text)
        doc.text = text
        return doc
Partially masks phone numbers and e-mail addresses contained in the document.
>>> MaskPersonalInformation()('06-1234-5678')
'06-1234-XXXX'
>>> MaskPersonalInformation()('075-123-4567')
'075-123-XXXX'
>>> MaskPersonalInformation()('0166-12-3456')
'0166-12-XXXX'
>>> MaskPersonalInformation()('09808-1-2345')
'09808-1-XXXX'
>>> MaskPersonalInformation()('090-1234-5678')
'090-1234-XXXX'
>>> MaskPersonalInformation()('0751234567')
'075123XXXX'
>>> MaskPersonalInformation()('08012345678')
'0801234XXXX'
>>> MaskPersonalInformation()('連絡は075-123-4567 まで')
'連絡は075-123-XXXX まで'
>>> MaskPersonalInformation()('+81-80-1234-5678')
'+81-80-1234-XXXX'
>>> MaskPersonalInformation()('+818012345678')
'+81801234XXXX'
>>> MaskPersonalInformation()('hogehoge@example.com')
'xxxx@yyy.com'
>>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
'何かあれば xxxx@yyy.jp まで連絡'
class DiscardTooManyNouns(Filter):
    """
    [!CAUTION] This filter requires `fugashi` package. Please install it
    by `pip install 'hojichar[all]'`.

    A filter that removes document with too many nouns in Japanese i.e.,
    documents such as advertisement, word salad, etc ...
    """

    def __init__(
        self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any
    ) -> None:
        """
        Args:
            threshold: document whose noun ratio is higher than this value will be discarded
            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
            *args:
            **kwargs:
        """
        super().__init__(*args, **kwargs)
        assert is_loaded_extras, (
            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
        )

        self.threshold = threshold
        self.max_parse_chars = max_parse_chars
        self.tagger = Tagger("-Owakati")
        assert "unidic" in self.tagger.dictionary_info[0]["filename"], (
            "MeCab dictionary must be unidic"
        )

    def _chunk_text(self, text: str) -> Iterable[str]:
        """Slice text into chunks of `max_parse_chars` length."""
        step = self.max_parse_chars
        for i in range(0, len(text), step):
            yield text[i : i + step]

    def apply(self, doc: Document) -> Document:
        """
        >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
        False
        >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
        True
        >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
        False
        """
        # remove "補助記号" from part-of-speech statistics
        # because they often decrease the noun ratio,
        # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5
        # however, we don't want such sentence

        pos_count: Counter[str] = Counter()
        for chunk in self._chunk_text(doc.text):
            for word in self.tagger(chunk):
                if word.feature.pos1 != "補助記号":
                    pos_count[word.feature.pos1] += 1

        try:
            noun_ratio = pos_count["名詞"] / sum(pos_count.values())
        except ZeroDivisionError:
            noun_ratio = 0.0
        if noun_ratio >= self.threshold:
            doc.is_rejected = True
        return doc
[!CAUTION] This filter requires the fugashi package. Please install it
with pip install 'hojichar[all]'.
A filter that removes documents with too many nouns in Japanese, i.e., documents such as advertisements, word salad, etc.
Args:
    threshold: documents whose noun ratio is higher than this value will be discarded
    max_parse_chars: maximum number of characters to parse in the document. Too large a value may cause a segmentation fault while parsing the document.
    *args:
    **kwargs:
>>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
False
>>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
True
>>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
False
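A rough sketch of the statistic this filter computes, assuming the fugashi extras and a unidic dictionary are installed (the example sentences are arbitrary): the noun ratio is the share of 名詞 tokens among all tokens other than 補助記号.

from collections import Counter

from fugashi import Tagger  # requires `pip install 'hojichar[all]'`

tagger = Tagger("-Owakati")

def noun_ratio(text: str) -> float:
    # Count part-of-speech tags, skipping 補助記号 as the filter does.
    pos_count = Counter(
        w.feature.pos1 for w in tagger(text) if w.feature.pos1 != "補助記号"
    )
    total = sum(pos_count.values())
    return pos_count["名詞"] / total if total else 0.0

print(noun_ratio("自然言語処理大好き!"))                     # below the default 0.80 threshold, so kept
print(noun_ratio("リンゴ・オレンジ・ミカン・バナナ セール中"))  # close to 1.0, so such a document is discarded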
class CharRepetitionRatioFilter(Filter):
    """
    文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します.
    名詞の連続からなるような広告テキストを取り除くのに有効です.

    実装は, BigScience で採用されていた前処理を参考にしています.
    元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453  # noqa: E501

    「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが,
    これは文書長の影響を軽減するためだとされています.

    掲示板のテキストが引っかかりやすい傾向があります.
    13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0
    的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう
    """

    def __init__(
        self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any
    ) -> None:
        """
        Args:
            threshold: document with character repetition ratio higher than this value will be discarded
            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
            *args:
            **kwargs:
        """  # noqa: E501

        super().__init__(*args, **kwargs)
        self.threshold = threshold
        self.ngram_size = ngram_size

    def apply(self, doc: Document) -> Document:
        ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size)
        if ratio >= self.threshold:
            doc.is_rejected = True
        return doc

    @staticmethod
    def compute_character_repetition_ratio(
        document: str, character_repetition_length: int
    ) -> float:
        def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]:
            character_ngrams: List[str] = [
                document[i : i + n] for i in range(len(document) - n + 1)
            ]
            freq_character_ngrams_dict: Dict[str, int] = {}
            for character_ngram in character_ngrams:
                freq_character_ngrams_dict[character_ngram] = (
                    freq_character_ngrams_dict.get(character_ngram, 0) + 1
                )
            return freq_character_ngrams_dict

        freq_character_ngrams_dict = get_freq_character_ngrams(
            document, character_repetition_length
        )
        if len(freq_character_ngrams_dict) == 0:
            return 0.0
        freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values())
        freq_character_ngrams = sorted(freq_character_ngrams, reverse=True)
        val_one = len([el for el in freq_character_ngrams if el == 1])
        num_rep_character_ngrams = min(
            int(np.sqrt(len(freq_character_ngrams))),
            len(freq_character_ngrams) - val_one,
        )
        character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum(
            freq_character_ngrams
        )
        return character_repetition_ratio
Computes the character n-gram repetition ratio (the share of the document accounted for by high-frequency character n-grams) and removes documents where this overlap is large. It is effective for removing advertisement text that consists of long runs of nouns.
The implementation follows preprocessing used by the BigScience project. Original implementation: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453
The "high-frequency character n-grams" are taken to be the top sqrt(number of unique n-grams) n-grams; this is said to reduce the influence of document length.
Bulletin-board (BBS) text tends to be caught by this filter: lines such as "13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0" appear at high frequency, so the character n-gram repetition ratio also becomes high.
Args:
    threshold: documents with a character repetition ratio higher than this value will be discarded
    ngram_size: character n-gram size. A larger value decreases false positives on long documents.
    *args:
    **kwargs:
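Because compute_character_repetition_ratio is a plain static method, the statistic can be inspected directly. A minimal sketch with made-up strings:

from hojichar.filters.document_filters import CharRepetitionRatioFilter

ratio_fn = CharRepetitionRatioFilter.compute_character_repetition_ratio

spammy = "セール中!" * 40  # the same few 5-grams appear over and over
normal = "今日は天気が良いので、近所の公園まで散歩に出かけました。"

print(ratio_fn(spammy, 5))  # roughly 0.40: above the 0.33 default, so such a document would be rejected
print(ratio_fn(normal, 5))  # 0.0: no 5-gram appears more than once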
class WordRepetitionRatioFilter(Filter):
    """
    [!CAUTION] This filter requires `fugashi` package. Please install it
    by `pip install 'hojichar[all]'`.

    単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ.
    BigScienceで採用されていた前処理を参考にしている.

    名詞が連打されているような広告テキストを取り除くのに有効な様子
    まともな文書がたまたま2回繰り返されている場合もあり、これを取り除いて良いのかは分からない
    例:
    "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57)
    ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ
    られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる
    なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上
    高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら
    経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時
    56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄
    り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入
    るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増
    益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、
    電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ
    イの回復で収益が急回"
    """  # noqa: E501

    def __init__(
        self,
        threshold: float = 0.40,
        ngram_size: int = 7,
        max_parse_chars: int = 100_000,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            threshold: document whose character repetition ratio is higher than this value will be discarded
            ngram_size: character ngram size. Larger value will decrease the false positive of long documents
            max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
            *args:
            **kwargs:
        """  # noqa: E501
        super().__init__(*args, **kwargs)
        assert is_loaded_extras, (
            "fugashi is required for this filter. Try pip install 'hojichar[all]'"
        )

        self.threshold = threshold
        self.ngram_size = ngram_size
        self.max_parse_chars = max_parse_chars
        self.tagger = Tagger("-Owakati")

    def _chunk_text(self, text: str) -> Iterable[str]:
        """Split text into chunks of `max_parse_chars` length."""
        step = self.max_parse_chars
        for i in range(0, len(text), step):
            yield text[i : i + step]

    def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]:
        freq: Dict[str, int] = {}
        if n <= 0 or len(words) < n:
            return freq
        for i in range(len(words) - n + 1):
            key = " ".join(words[i : i + n])
            freq[key] = freq.get(key, 0) + 1
        return freq

    def apply(self, doc: Document) -> Document:
        ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size)
        if ratio >= self.threshold:
            doc.is_rejected = True
        return doc

    def compute_word_repetition_ratio(self, document: str, n: int) -> float:
        total_counter: Counter[str] = Counter()

        for chunk in self._chunk_text(document):
            words = [w.surface for w in self.tagger(chunk)]
            total_counter.update(self._get_freq_word_ngrams(words, n))

        if not total_counter:
            return 0.0

        total = sum(total_counter.values())
        repeated = sum(v for v in total_counter.values() if v > 1)
        return repeated / total
[!CAUTION] This filter requires the fugashi package. Please install it
with pip install 'hojichar[all]'.
A filter that computes the word n-gram repetition ratio (the share of the document occupied by duplicated word n-grams) and rejects documents where this overlap is large.
It follows preprocessing used by the BigScience project.
It appears to be effective for removing advertisement text in which nouns are repeated over and over.
A legitimate document that just happens to be repeated twice can also be caught, and it is unclear whether such documents should really be removed.
Example:
"ウェブ
本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ) 2013/05/10(10:57) ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ) 2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ イの回復で収益が急回"
Args:
    threshold: documents whose word repetition ratio is higher than this value will be discarded
    ngram_size: word n-gram size. A larger value decreases false positives on long documents.
    max_parse_chars: maximum number of characters to parse in the document. Too large a value may cause a segmentation fault while parsing the document.
    *args:
    **kwargs:
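The word-level statistic can be probed the same way through compute_word_repetition_ratio, again assuming the fugashi extras are installed; the paragraph below is made up and simply pasted twice:

from hojichar.filters.document_filters import WordRepetitionRatioFilter

word_rep = WordRepetitionRatioFilter()  # threshold=0.40, ngram_size=7 by default

paragraph = "本日の会議では新しい製品の発売時期と価格について話し合いました。"
repeated_doc = paragraph * 2  # the same paragraph pasted twice

print(word_rep.compute_word_repetition_ratio(paragraph, 7))     # 0.0: no word 7-gram repeats
print(word_rep.compute_word_repetition_ratio(repeated_doc, 7))  # well above the 0.40 default: most 7-grams occur twice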
class DiscardTooManySpecialToken(Filter):
    """
    [!CAUTION] This filter requires `emoji` package. Please install it
    by `pip install 'hojichar[all]'`.

    句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ
    元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16  # noqa: E501
    """

    def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None:
        """
        Args:
            threshold: document whose special token ratio is higher than this value will be discarded
            *args:
            **kwargs:
        """  # noqa: E501
        super().__init__(*args, **kwargs)

        # digits are not regarded as special tokens
        # otherwise many false positives are made, i.e., good documents discarded
        main_special_characters = string.punctuation + string.whitespace  # + string.digits
        other_special_characters = (
            " ’“”–▬…✦�£•€«»°·═"
            "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖"
            "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚"
            "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖"
            "」﴾》�"
        )

        en_emoji = emoji.EMOJI_DATA.keys()

        special_characters_default = set(main_special_characters + other_special_characters)
        special_characters_default.update(en_emoji)
        self.special_characters = special_characters_default

        self.threshold = threshold

    def _compute_special_characters_ratio(self, text: str) -> float:
        if len(text) == 0:
            return 0

        special_characters_ratio = len(
            [char for char in text if char in self.special_characters]
        ) / len(text)
        return special_characters_ratio

    def apply(self, doc: Document) -> Document:
        special_characters_ratio = self._compute_special_characters_ratio(doc.text)

        if special_characters_ratio > self.threshold:
            doc.is_rejected = True
        return doc
[!CAUTION] This filter requires the emoji package. Please install it
with pip install 'hojichar[all]'.
A filter for removing documents that contain more than a certain ratio of symbols (including punctuation), whitespace, emoji, and other special characters. Original implementation: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16
Args:
    threshold: documents whose special token ratio is higher than this value will be discarded
    *args:
    **kwargs:
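A quick sketch of the ratio this filter reacts to, assuming the emoji extra is installed; the strings are arbitrary:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManySpecialToken

special_filter = DiscardTooManySpecialToken()  # threshold=0.4 by default

symbol_heavy = Document("!!!★★★…(^^)///???")
plain = Document("お問い合わせは平日9時から17時まで受け付けています。")

print(special_filter.apply(symbol_heavy).is_rejected)  # True: nearly every character is a symbol
print(special_filter.apply(plain).is_rejected)         # False: only the final 。 counts as special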
class SingleCharacterRepetitionFilter(Filter):
    """
    単一文字が大量に繰り返されているような文書を取り除くためのフィルタ
    そのような文書はノイズである可能性が高いため
    参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい
    https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset  # noqa: E501
    """

    def __init__(
        self,
        threshold: int = 200,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            threshold: The document is removed if character is repeated for this value or more
            *args:
            **kwargs:
        """
        super().__init__(*args, **kwargs)
        self.threshold = threshold

    def _is_repeat_contained(self, text: str) -> bool:
        groups = groupby(text)
        is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups)
        return is_repeat_contained

    def apply(self, doc: Document) -> Document:
        if self._is_repeat_contained(doc.text):
            doc.is_rejected = True
        return doc
A filter for removing documents in which a single character is repeated a huge number of times, since such documents are very likely to be noise.
Reference: according to the BigScience project, the OSCAR dataset contained documents consisting of nothing but about 2M backslashes.
https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset
Args:
    threshold: the document is removed if any single character is repeated this many times or more
    *args:
    **kwargs:
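The check is a run-length test over itertools.groupby: a document is dropped as soon as any single character forms a run of threshold (200 by default) or more consecutive occurrences. A small sketch with synthetic strings:

from hojichar.core.models import Document
from hojichar.filters.document_filters import SingleCharacterRepetitionFilter

rep_filter = SingleCharacterRepetitionFilter(threshold=200)

noise = Document("広告" + "\\" * 250 + "終わり")       # a 250-character run of backslashes
normal = Document("ああ、今日もいい天気ですね。" * 30)  # repetition of a phrase, not of a single character

print(rep_filter.apply(noise).is_rejected)   # True
print(rep_filter.apply(normal).is_rejected)  # False: no single character repeats 200 times in a row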
class DiscardTooManyEndingEllipsis(Filter):
    """
    ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです.
    ellipsisとしては ... と … を用いている
    同様のフィルタが RedPajama v2で用いられています.

    例として, 以下のような文書を検知します.
    ```
    ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
    バツイチアラフォー 婚活ち女性の特徴と子持な付...
    ```

    デフォルトではしきい値を0.7としているが, これはC4から0.1%を削るような設定であり、
    precisionを重視した設定です.
    """

    def __init__(
        self,
        threshold: float = 0.7,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
            *args:
            **kwargs:
        """  # noqa: E501
        super().__init__(*args, **kwargs)
        self.threshold = threshold
        self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n")  # matches ...\n and …\n

    def apply(self, doc: Document) -> Document:
        ellipsis_count = len(self.ellipsis_pattern.findall(doc.text))
        newline_count = max(doc.text.count("\n"), 1)  # avoid zero division
        ellipsis_ratio = ellipsis_count / newline_count

        if ellipsis_ratio > self.threshold:
            doc.is_rejected = True
        return doc
A filter for removing documents that contain many lines ending with an ellipsis; both ... and … count as ellipses. A similar filter is used in RedPajama v2.
As an example, it detects documents such as the following:
ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
バツイチアラフォー 婚活ち女性の特徴と子持な付...
The default threshold is 0.7; this setting removes roughly 0.1% of C4 and therefore favors precision.
Args:
    threshold: the document is removed if the ratio of lines ending with an ellipsis is higher than this value
    *args:
    **kwargs:
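The ratio is simply the number of lines ending in ... or … divided by the number of newlines. A worked sketch with a made-up snippet:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManyEndingEllipsis

ellipsis_filter = DiscardTooManyEndingEllipsis()  # threshold=0.7 by default

truncated = Document(
    "新着記事の一覧です…\n"
    "続きを読むにはこちらをクリック…\n"
    "人気ランキングを見る…\n"
    "お問い合わせはこちら\n"
)
# 3 of the 4 newline-terminated lines end with an ellipsis -> ratio 0.75 > 0.7
print(ellipsis_filter.apply(truncated).is_rejected)  # True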
class DiscardTooShortLines(Filter):
    """
    短い行を大量に含む文書を捨てるためのフィルタです.

    メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です.
    """

    def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None:
        """
        Args:
            threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value.
            *args:
            **kwargs:
        """  # noqa: E501
        super().__init__(*args, **kwargs)
        self.threshold = threshold
        # この値は適当に決め打ち
        self.minimum_line_length = 10

    def apply(self, doc: Document) -> Document:
        lines = [len(x) for x in doc.text.split("\n")]
        short_lines = [x for x in lines if x <= self.minimum_line_length]
        if (len(short_lines) / len(lines)) > self.threshold:
            doc.is_rejected = True
        return doc
A filter for discarding documents that contain a large number of short lines.
It is effective for removing documents that mostly consist of elements such as menu bars or breadcrumb lists.
Args:
    threshold: the document is removed if the ratio of short lines (10 characters or fewer) exceeds this value
    *args:
    **kwargs:
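A sketch of the short-line ratio with a breadcrumb-like document (the text is made up); lines of 10 characters or fewer count as short:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooShortLines

short_line_filter = DiscardTooShortLines()  # threshold=0.5 by default

menu_like = Document("ホーム\n製品情報\n会社概要\nお問い合わせ\n採用情報")
article_like = Document("本日は新製品の発表会を開催しました。\n多くの方にご来場いただきありがとうございました。")

print(short_line_filter.apply(menu_like).is_rejected)     # True: every line is 10 characters or fewer
print(short_line_filter.apply(article_like).is_rejected)  # False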