hojichar.filters.document_filters
import json
import logging
import pathlib
import re
import string
import time
import unicodedata
from collections import Counter
from itertools import groupby
from os import PathLike
from typing import Any, Dict, Iterable, List, Optional, Union

import numpy as np

import hojichar
from hojichar.core.filter_interface import Filter
from hojichar.core.models import Document, Token

try:
    import emoji
    from fugashi import Tagger  # type: ignore

    is_loaded_extras = True
except ImportError:
    is_loaded_extras = False

BASE_PATH = pathlib.Path(hojichar.__path__[0])
logger = logging.getLogger(__name__)
class ExampleHojiChar(Filter):
    """基本的なフィルタの実装例です. 末尾に'<hojichar>'を追加します."""

    def apply(self, document: Document) -> Document:
        """
        >>> ExampleHojiChar()("hello, world")
        'hello, world<hojichar>'
        """
        document.text += "<hojichar>"
        return document
An example of a basic filter implementation. Appends '<hojichar>' to the end of the document text.
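Following this example, a custom filter only needs to subclass `Filter` and override `apply`. A short sketch, using a hypothetical `AppendFooter` filter name and the imports this module itself uses:

from hojichar.core.filter_interface import Filter
from hojichar.core.models import Document


class AppendFooter(Filter):
    """Hypothetical example: appends a footer line to every document."""

    def apply(self, document: Document) -> Document:
        document.text += "\n-- end of document --"
        return document


print(AppendFooter().apply(Document("hello, world")).text)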
class ExampleDiscardDocumentContainKeyword(Filter):
    """特定のキーワードを持つドキュメントを破棄するようなフィルタの実装例です."""

    def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.keyword = keyword

    def apply(self, document: Document) -> Document:
        """
        >>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
        True
        """
        if self.keyword in document.text:
            document.is_rejected = True
        return document
An example of a filter that discards documents containing a specific keyword.
def __init__(self, keyword: str, *args: Any, **kwargs: Any) -> None
Initialize the filter.

Parameters
p : float
    The probability of applying the filter. If `p` is 1, the filter will always be applied.
skip_rejected : bool
    If `True`, the filter will skip documents that are already rejected.
    If you want to apply the filter to all documents (e.g., postprocess), set this to `False`.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If `None`, a new random number generator will be created.
    If `None` and the filter is used in the `Compose` class, the random state is shared with the `Compose` object.
use_batch : bool
    If `True`, the filter will process documents in batches in the `apply_stream` method.
batch_size : int
    The size of the batch to process documents in the `apply_stream` method.
kwargs : Any
    Additional keyword arguments to pass to the filter.
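A usage sketch of these common constructor arguments. `p` and `skip_rejected` reach `Filter.__init__` because the subclasses in this module forward `*args`/`**kwargs` to it:

from hojichar.filters.document_filters import ExampleDiscardDocumentContainKeyword

# Apply the filter to roughly half of the documents, and also to documents
# that an upstream filter has already rejected.
f = ExampleDiscardDocumentContainKeyword("バカ", p=0.5, skip_rejected=False)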
def apply(self, document: Document) -> Document
>>> ExampleDiscardDocumentContainKeyword("バカ").apply(Document("あいつはバカだ")).is_rejected
True
class Identity(Filter):
    """何も変化を加えないフィルタです. テスト・デバッグに用いられます."""

    def apply(self, document: Document) -> Document:
        return document
A filter that makes no changes to the document. Used for testing and debugging.
Definition of filter behavior.

The document must have a protocol `TextContent`; the `hojichar.Document` class is the one mostly used.
In this method, the filter modifies `document.text` or `document.extras`, and sets `document.is_rejected = True` to discard the document.

Parameters
document : Document
    Input document

Returns
Document
    Processed document
class DiscardAll(Filter):
    """
    すべてのドキュメントを破棄するフィルタです.
    テスト・デバッグに用いられます.
    """

    def apply(self, document: Document) -> Document:
        document.is_rejected = True
        return document
A filter that discards all documents. Used for testing and debugging.
def apply(self, document: Document) -> Document
Definition of filter behavior, inherited from `Filter.apply`; see the description above.
class ApplyDiscard(Filter):
    """
    上流フィルタで破棄された`Document`を空文字列にします.

    `Document.is_rejected=True` の ドキュメントは無視されるため,
    このフィルタを `Compose` のコンストラクタに渡しても動作しません.
    このフィルタは主に`Compose` 内部や, `discard_filtered=False` を指定
    したデバッグ時などに利用されます.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

    def apply(self, document: Document) -> Document:
        """
        >>> ApplyDiscard().apply(Document(text="hello", is_rejected=True)).text
        ''
        """
        if document.is_rejected:
            document.text = ""

        return document
Sets the text of a `Document` rejected by an upstream filter to the empty string.

Because documents with `Document.is_rejected=True` are ignored, this filter has no effect when passed to the `Compose` constructor. It is mainly used inside `Compose`, or when debugging with `discard_filtered=False`.
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
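A small sketch of where `ApplyDiscard` sits in a pipeline: an upstream filter rejects the document, and `ApplyDiscard` then blanks its text, mirroring the doctest above.

from hojichar.core.models import Document
from hojichar.filters.document_filters import ApplyDiscard, DiscardAll

doc = Document("unwanted text")
doc = DiscardAll().apply(doc)    # upstream filter marks the document as rejected
doc = ApplyDiscard().apply(doc)  # downstream, the rejected text is emptied
print(repr(doc.text))            # ''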
class Sleep(Filter):
    """
    デバッグ用のフィルタです. 指定秒スリープします.
    """

    def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.time = time

    def apply(self, document: Document) -> Document:
        """
        >>> Sleep(0.1)('hello')  # After 0.1 seconds,
        'hello'
        """
        time.sleep(self.time)
        return document
A filter for debugging. Sleeps for the specified number of seconds.
def __init__(self, time: float = 1.0, *args: Any, **kwargs: Any) -> None
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
class DocumentNormalizer(Filter):
    """
    Unicode の正規化をします.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

    def apply(self, document: Document) -> Document:
        document.text = unicodedata.normalize("NFKC", document.text)
        return document
Performs Unicode (NFKC) normalization of the document text.
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
def apply(self, document: Document) -> Document
Definition of filter behavior, inherited from `Filter.apply`; see the description above.
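For reference, NFKC normalization folds full-width ASCII and half-width katakana into their canonical forms. A small sketch:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DocumentNormalizer

doc = DocumentNormalizer().apply(Document("ＡＢＣ　１２３ ﾎｰｼﾞ茶"))
# Expected: "ABC 123 ホージ茶" (full-width Latin letters/digits and the
# ideographic space become ASCII; half-width katakana becomes full-width).
print(doc.text)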
class JSONLoader(Filter):
    """
    テキストを Json として解釈し, `key` で指定した要素を文字列として
    doument に格納します.デフォルトの `key` は 'text' です.

    Json の読み込み, あるいは `key` の読み込みに失敗した際には例外を送出します.
    これらを無視する場合は, `ignore=True` にします. その際, 読み込みに失敗
    したドキュメントは破棄されます.
    """

    def __init__(
        self,
        key: str = "text",
        ignore: bool = False,
        extra_keys: Optional[List[str]] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)
        self.key = key
        self.ignore = ignore
        self.extra_keys = extra_keys

    def apply(self, document: Document) -> Document:
        """
        >>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
        'hello, world'

        >>> JSONLoader()( '{"text": hello, world ....' )  # Broken JSON
        Traceback (most recent call last):
            ...
        json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)

        >>> JSONLoader()( '{"words": 2}' )
        Traceback (most recent call last):
            ...
        KeyError: 'text'

        >>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
        True
        """
        try:
            data = json.loads(document.text)
            document.text = str(data[self.key])
            if self.extra_keys is not None:
                document.extras = {key: data[key] for key in self.extra_keys if key in data}
        except Exception as e:
            logger.error(f"Failed to parsing in JSONLoader. Input document: \n{document.text}")
            if self.ignore:
                document.is_rejected = True
                return document
            else:
                raise e

        return document
Interprets the text as JSON and stores the element specified by `key` in the document as a string. The default `key` is 'text'.

If parsing the JSON, or reading `key`, fails, an exception is raised. To ignore such errors, set `ignore=True`; in that case, documents that fail to load are rejected.
def __init__(self, key: str = "text", ignore: bool = False, extra_keys: Optional[List[str]] = None, *args: Any, **kwargs: Any) -> None
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
def apply(self, document: Document) -> Document
>>> JSONLoader()( '{"text": "hello, world", "words": 2}' )
'hello, world'
>>> JSONLoader()( '{"text": hello, world ....' ) # Broken JSON
Traceback (most recent call last):
...
json.decoder.JSONDecodeError: Expecting value: line 1 column 10 (char 9)
>>> JSONLoader()( '{"words": 2}' )
Traceback (most recent call last):
...
KeyError: 'text'
>>> JSONLoader(ignore=True).apply(Document('{"text": hello, world ....' )).is_rejected
True
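A sketch of `extra_keys`, which copies additional JSON fields into `Document.extras`; the field names "url" and "title" below are illustrative:

from hojichar.core.models import Document
from hojichar.filters.document_filters import JSONLoader

loader = JSONLoader(key="text", extra_keys=["url", "title"])
doc = loader.apply(Document('{"text": "hello", "url": "https://example.com", "title": "greeting"}'))
print(doc.text)    # hello
print(doc.extras)  # {'url': 'https://example.com', 'title': 'greeting'}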
class JSONDumper(Filter):
    """
    Document.text の文字列を json に変換します.
    必要に応じ Document のメタデータを付与します. これはドキュメントの破棄事由が含まれ、偽陽性の分析に有効です。
    デフォルトで `skip_rejected` が `False` にセットされており、Document の破棄フラグにかかわらず
    処理されます。
    """

    def __init__(
        self,
        dump_reason: bool = False,
        p: float = 1,
        skip_rejected: bool = False,
        export_extras: bool = False,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            dump_reason (bool, optional): `is_rejected`, `reason` エントリをダンプします. Defaults to False.
            p (float, optional): Apply probability. Defaults to 1.
            skip_rejected (bool, optional): 破棄済みサンプルを排除しません.
        """
        super().__init__(p, skip_rejected, *args, **kwargs)
        self.dump_reason = dump_reason
        self.export_extras = export_extras

    def apply(self, document: Document) -> Document:
        """
        >>> JSONDumper()("hojichar")
        '{"text": "hojichar"}'
        """
        text = document.text
        if self.dump_reason:
            if self.export_extras:
                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
                document.text = json.dumps(
                    {
                        "text": text,
                        "is_rejected": document.is_rejected,
                        "reason": document.reject_reason,
                        "extras": output_extras,
                    },
                    ensure_ascii=False,
                )
            else:
                document.text = json.dumps(
                    {
                        "text": text,
                        "is_rejected": document.is_rejected,
                        "reason": document.reject_reason,
                    },
                    ensure_ascii=False,
                )
        else:
            if self.export_extras:
                output_extras = {k: v for k, v in document.extras.items() if k != "__init_stats"}
                document.text = json.dumps(
                    {
                        "text": text,
                        "extras": output_extras,
                    },
                    ensure_ascii=False,
                )
            else:
                document.text = json.dumps({"text": text}, ensure_ascii=False)
        return document
Converts the `Document.text` string to JSON. Optionally attaches the Document metadata, which includes the reason a document was rejected and is useful for analyzing false positives. `skip_rejected` is set to `False` by default, so documents are processed regardless of their rejection flag.
def __init__(self, dump_reason: bool = False, p: float = 1, skip_rejected: bool = False, export_extras: bool = False, *args: Any, **kwargs: Any) -> None
Args:
    dump_reason (bool, optional): Dump the `is_rejected` and `reason` entries. Defaults to False.
    p (float, optional): Apply probability. Defaults to 1.
    skip_rejected (bool, optional): Do not exclude already-rejected samples.
def apply(self, document: Document) -> Document
>>> JSONDumper()("hojichar")
'{"text": "hojichar"}'
class DocumentLengthFilter(Filter):
    """
    `min_doc_len`, `max_doc_len` で指定した上限・下限の範囲内にないドキュメントを破棄します.
    デフォルトでは 200字 以上 50000字以内のテキストが受理されます.
    """

    def __init__(
        self,
        min_doc_len: Optional[int] = None,
        max_doc_len: Optional[int] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)

        self.min_doc_len = min_doc_len
        self.max_doc_len = max_doc_len

    def apply(self, doc: Document) -> Document:
        """
        >>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
        True
        """
        doc_len = len(doc.text)
        if self.min_doc_len is not None:
            if doc_len < self.min_doc_len:
                doc.is_rejected = True
        if self.max_doc_len is not None:
            if self.max_doc_len < doc_len:
                doc.is_rejected = True
        return doc
Discards documents whose length falls outside the range specified by `min_doc_len` and `max_doc_len`. By default, texts of at least 200 and at most 50,000 characters are accepted.
def __init__(self, min_doc_len: Optional[int] = None, max_doc_len: Optional[int] = None, *args: Any, **kwargs: Any) -> None
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
def apply(self, doc: Document) -> Document
>>> DocumentLengthFilter(min_doc_len=5).apply(Document("1234")).is_rejected
True
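A usage sketch with both bounds set explicitly:

from hojichar.core.models import Document
from hojichar.filters.document_filters import DocumentLengthFilter

length_filter = DocumentLengthFilter(min_doc_len=200, max_doc_len=50000)
print(length_filter.apply(Document("too short")).is_rejected)  # True: shorter than min_doc_len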
class NgWordsFilterJa(Filter):
    """
    日本語のNGワード(および不適切語)を含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.

    `ignore_confused` を `True` にすると,
    偽陽性を軽減するために, カタカナのNGワードは前後にカタカナが無い場合のみNG判定されます.
    デフォルト値は `False` です.
    """

    def __init__(
        self,
        dict_path: Union[str, PathLike],
        ignore_confused: bool = False,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        super().__init__(*args, **kwargs)

        with open(dict_path, encoding="utf-8") as fp:
            ng_words = fp.read().split("\n")
            ng_words = [w.strip() for w in ng_words if not len(w) == 0]

        if ignore_confused:
            words_katakana = []
            words_not_katakana = []
            for w in ng_words:
                if re.fullmatch(r"[ァ-ヴー]+", w):
                    words_katakana.append(re.escape(w))
                else:
                    words_not_katakana.append(re.escape(w))
            katakana_pat = "|".join(words_katakana)
            katakana_pat = rf"(?<![ァ-ヴー])({katakana_pat})(?![ァ-ヴー])"
            pat = "|".join(words_not_katakana) + "|" + katakana_pat
            self.keyword_pat = re.compile(pat)
        else:
            ng_words = [re.escape(w) for w in ng_words]
            pat = "|".join(ng_words)
            self.keyword_pat = re.compile(pat)

    def apply(self, doc: Document) -> Document:
        regex_match = self.keyword_pat.search(doc.text)
        if regex_match:
            doc.is_rejected = True
            self.matched_text = regex_match.group()
            self.matched_text_neighbor = doc.text[
                regex_match.start() - 20 : regex_match.end() + 20
            ]

        return doc
Discards documents that contain Japanese NG words (and other inappropriate terms). The keyword list is read from the file specified by `dict_path`, a plain-text file with one word per line.

If `ignore_confused` is set to `True`, katakana NG words are flagged only when they are not adjacent to other katakana characters, which reduces false positives. The default is `False`.
def __init__(self, dict_path: Union[str, PathLike], ignore_confused: bool = False, *args: Any, **kwargs: Any) -> None
Initialize the filter. The constructor parameters are inherited from `Filter`; see the parameter description above.
def apply(self, doc: Document) -> Document
Definition of filter behavior, inherited from `Filter.apply`; see the description above.
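A sketch of preparing a keyword file and enabling `ignore_confused`; the file name and keywords below are illustrative:

import tempfile
from pathlib import Path

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterJa

with tempfile.TemporaryDirectory() as tmp:
    dict_path = Path(tmp) / "ng_words_ja.txt"
    dict_path.write_text("バカ\n馬鹿\n", encoding="utf-8")  # one NG word per line

    ng_filter = NgWordsFilterJa(dict_path, ignore_confused=True)
    print(ng_filter.apply(Document("あいつはバカだ")).is_rejected)  # True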
class NgWordsFilterEn(Filter):
    """
    英語のNGワード(および不適切語)を含む文書を破棄します.
    `dict_path` で指定したファイルから, キーワードのリストを得ます.
    ファイルは単語が改行で羅列されたテキストファイルです.
    """

    def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        with open(dict_path, encoding="utf-8") as fp:
            ng_words = fp.read().split("\n")
            ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0]
            pat = "|".join(ng_words)
        # 英語のパターンにマッチするようにしている, \s[単語]\s や [単語]. [単語], などにマッチ.
        self.keyword_pat = re.compile(rf"(?:^| )({pat})(?:( |,|\.)|$)", re.IGNORECASE)

    def apply(self, doc: Document) -> Document:
        if self.keyword_pat.search(doc.text):
            doc.is_rejected = True
        return doc
Discards documents that contain English NG words (and other inappropriate terms). The keyword list is read from the file specified by `dict_path`, a plain-text file with one word per line.
def __init__(self, dict_path: Union[str, PathLike], *args: Any, **kwargs: Any) -> None
363 def apply(self, doc: Document) -> Document: 364 if self.keyword_pat.search(doc.text): 365 doc.is_rejected = True 366 return doc
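A minimal usage sketch (not part of the library) of the word-boundary style matching compiled above; the dictionary file and the keyword "badword" are hypothetical.

```
import tempfile

from hojichar.core.models import Document
from hojichar.filters.document_filters import NgWordsFilterEn

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fp:
    fp.write("badword\n")  # hypothetical NG keyword
    dict_path = fp.name

ng_filter = NgWordsFilterEn(dict_path)
print(ng_filter.apply(Document("This text contains badword.")).is_rejected)   # True: matched as a standalone word
print(ng_filter.apply(Document("This text contains notbadword")).is_rejected) # False: embedded in a longer token
print(ng_filter.apply(Document("BADWORD")).is_rejected)                       # True: matching is case-insensitive
```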
369class DiscardAdultContentJa(NgWordsFilterJa): 370 """ 371 日本語のアダルトキーワード(および不適切語)を含む文書を破棄します. 372 `dict_path` で指定したファイルから, キーワードのリストを得ます. 373 ファイルは単語が改行で羅列されたテキストファイルです. 374 デフォルトの`dict_path` は /hojichar/dict/adult_keywords_ja.txt です. 375 """ 376 377 def __init__( 378 self, 379 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt", 380 *args: Any, 381 **kwargs: Any, 382 ) -> None: 383 super().__init__(dict_path, *args, **kwargs) 384 385 def apply(self, doc: Document) -> Document: 386 """ 387 >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 388 True 389 390 >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected 391 False 392 393 挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック, 394 >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \ 395 # Matching with NG keyword "アス" 396 True 397 """ 398 return super().apply(doc)
Discards documents containing Japanese adult keywords (and other inappropriate words).
The keyword list is read from the file specified by `dict_path`.
The file is a plain-text file with one word per line.
The default `dict_path` is /hojichar/dict/adult_keywords_ja.txt.
377 def __init__( 378 self, 379 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_ja.txt", 380 *args: Any, 381 **kwargs: Any, 382 ) -> None: 383 super().__init__(dict_path, *args, **kwargs)
385 def apply(self, doc: Document) -> Document: 386 """ 387 >>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 388 True 389 390 >>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected 391 False 392 393 挙動は正しいが誤検知しているケース. 他にも, サック in リュックサック, 394 >>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected \ 395 # Matching with NG keyword "アス" 396 True 397 """ 398 return super().apply(doc)
>>> DiscardAdultContentJa().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentJa().apply(Document("ほうじ茶")).is_rejected
False
A case where the behavior is correct but the match is a false positive. Another example: サック matching inside リュックサック.
>>> DiscardAdultContentJa().apply(Document("アスパラガス")).is_rejected # Matching with NG keyword "アス"
True
401class DiscardAdultContentEn(NgWordsFilterEn): 402 """ 403 英語のアダルトキーワード(および不適切語)を含む文書を破棄します. 404 `dict_path` で指定したファイルから, キーワードのリストを得ます. 405 ファイルは単語が改行で羅列されたテキストファイルです. 406 デフォルトの`dict_path` は /hojichar/dict/adult_keywords_en.txt です. 407 """ 408 409 def __init__( 410 self, 411 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt", 412 *args: Any, 413 **kwargs: Any, 414 ) -> None: 415 super().__init__(dict_path, *args, **kwargs) 416 417 def apply(self, doc: Document) -> Document: 418 """ 419 >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 420 True 421 422 >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected 423 False 424 """ 425 return super().apply(doc)
Discards documents containing English adult keywords (and other inappropriate words).
The keyword list is read from the file specified by `dict_path`.
The file is a plain-text file with one word per line.
The default `dict_path` is /hojichar/dict/adult_keywords_en.txt.
409 def __init__( 410 self, 411 dict_path: Union[str, PathLike] = BASE_PATH / "dict/adult_keywords_en.txt", 412 *args: Any, 413 **kwargs: Any, 414 ) -> None: 415 super().__init__(dict_path, *args, **kwargs)
417 def apply(self, doc: Document) -> Document: 418 """ 419 >>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected 420 True 421 422 >>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected 423 False 424 """ 425 return super().apply(doc)
>>> DiscardAdultContentEn().apply(Document("<TEST_STRING_OF_ADULT_KEYWORD>")).is_rejected
True
>>> DiscardAdultContentEn().apply(Document("hojichar")).is_rejected
False
428class DiscardDiscriminationContentJa(NgWordsFilterJa): 429 """ 430 日本語の差別キーワード(および不適切語)を含む文書を破棄します. 431 `dict_path` で指定したファイルから, キーワードのリストを得ます. 432 ファイルは単語が改行で羅列されたテキストファイルです. 433 デフォルトの`dict_path` は /hojichar/dict/discrimination_keywords_ja.txt です. 434 """ 435 436 def __init__( 437 self, 438 dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt", 439 *args: Any, 440 **kwargs: Any, 441 ): 442 super().__init__(dict_path, *args, **kwargs) 443 444 def apply(self, doc: Document) -> Document: 445 """ 446 >>> DiscardDiscriminationContentJa().\ 447 apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected 448 True 449 450 >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected 451 False 452 """ 453 return super().apply(doc)
Discards documents containing Japanese discriminatory keywords (and other inappropriate words).
The keyword list is read from the file specified by `dict_path`.
The file is a plain-text file with one word per line.
The default `dict_path` is /hojichar/dict/discrimination_keywords_ja.txt.
436 def __init__( 437 self, 438 dict_path: Union[str, PathLike] = BASE_PATH / "dict/discrimination_keywords_ja.txt", 439 *args: Any, 440 **kwargs: Any, 441 ): 442 super().__init__(dict_path, *args, **kwargs)
444 def apply(self, doc: Document) -> Document: 445 """ 446 >>> DiscardDiscriminationContentJa().\ 447 apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected 448 True 449 450 >>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected 451 False 452 """ 453 return super().apply(doc)
>>> DiscardDiscriminationContentJa().apply(Document("<TEST_STRING_OF_DISCRIMINATION_KEYWORD>")).is_rejected
True
>>> DiscardDiscriminationContentJa().apply(Document("ほうじ茶")).is_rejected
False
456class DiscardViolenceContentJa(NgWordsFilterJa): 457 """ 458 日本語の暴力・脅迫を示唆するキーワードを含む文書を破棄します. 459 `dict_path` で指定したファイルから, キーワードのリストを得ます. 460 ファイルは単語が改行で羅列されたテキストファイルです. 461 デフォルトの`dict_path` は /hojichar/dict/violence_keywords_ja.txt です. 462 """ 463 464 def __init__( 465 self, 466 dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt", 467 *args: Any, 468 **kwargs: Any, 469 ) -> None: 470 super().__init__(dict_path, *args, **kwargs) 471 472 def apply(self, doc: Document) -> Document: 473 """ 474 >>> DiscardViolenceContentJa()\ 475 .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected 476 True 477 478 >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected 479 False 480 """ 481 return super().apply(doc)
Discards documents containing Japanese keywords that suggest violence or threats.
The keyword list is read from the file specified by `dict_path`.
The file is a plain-text file with one word per line.
The default `dict_path` is /hojichar/dict/violence_keywords_ja.txt.
464 def __init__( 465 self, 466 dict_path: Union[str, PathLike] = BASE_PATH / "dict/violence_keywords_ja.txt", 467 *args: Any, 468 **kwargs: Any, 469 ) -> None: 470 super().__init__(dict_path, *args, **kwargs)
472 def apply(self, doc: Document) -> Document: 473 """ 474 >>> DiscardViolenceContentJa()\ 475 .apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected 476 True 477 478 >>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected 479 False 480 """ 481 return super().apply(doc)
>>> DiscardViolenceContentJa().apply(Document("<TEST_STRING_OF_VIOLENCE_KEYWORD>")).is_rejected
True
>>> DiscardViolenceContentJa().apply(Document("ほうじ茶")).is_rejected
False
484class DiscardBBSComments(Filter): 485 """ 486 正規表現 "BBS Pattern" に `max_allow_num` 回よりたくさんマッチする文書を破棄します. 487 `max_allow_num` のデフォルト値は14です. 488 正規表現 "BBS Pattern" は下記のリンクで検証可能です. 489 https://regex101.com/r/ybQvL2/1 490 """ 491 492 def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None: 493 super().__init__(*args, **kwargs) 494 495 self.max_allowed_num = max_allowed_num 496 self.keyword_pat = re.compile( 497 r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-" # noqa 498 ) 499 500 def apply(self, doc: Document) -> Document: 501 """ 502 >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected 503 True 504 505 >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected 506 False 507 """ 508 bbs_factor = self.keyword_pat.findall(doc.text) 509 if len(bbs_factor) > self.max_allowed_num: 510 doc.is_rejected = True 511 return doc
正規表現 "BBS Pattern" に max_allow_num
回よりたくさんマッチする文書を破棄します.
max_allow_num
のデフォルト値は14です.
正規表現 "BBS Pattern" は下記のリンクで検証可能です.
https://regex101.com/r/ybQvL2/1
492 def __init__(self, max_allowed_num: int = 14, *args: Any, **kwargs: Any) -> None: 493 super().__init__(*args, **kwargs) 494 495 self.max_allowed_num = max_allowed_num 496 self.keyword_pat = re.compile( 497 r"\d{4}[年\.\-\/][\ ]*\d{1,2}[月\.\-\/][\ ]*\d{1,2}[日]*|コメント|SOLD OUT|レビュー|投稿|ページ|\([月火水木金土日]\)|質問|\d+話|楽天市場|-" # noqa 498 )
500 def apply(self, doc: Document) -> Document: 501 """ 502 >>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected 503 True 504 505 >>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected 506 False 507 """ 508 bbs_factor = self.keyword_pat.findall(doc.text) 509 if len(bbs_factor) > self.max_allowed_num: 510 doc.is_rejected = True 511 return doc
>>> DiscardBBSComments().apply(Document("楽天市場 質問 投稿 コメント レビュー "*3)).is_rejected
True
>>> DiscardBBSComments().apply(Document("鏡餅")).is_rejected
False
514class DiscardAds(Filter): 515 """ 516 主に広告キーワードを`max_allow_num`より多く含む文書を破棄します. 517 デフォルトで`max_allow_num` は14です. 518 `dict_path` で指定したファイルから, 広告キーワードのリストを得ます. 519 ファイルは単語が改行で羅列されたテキストファイルです. 520 デフォルトの`dict_path` は /hojichar/dict/advertisement_keywords_ja.txt です. 521 """ 522 523 def __init__( 524 self, 525 dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt", 526 max_allowed_num: int = 14, 527 *args: Any, 528 **kwargs: Any, 529 ): 530 super().__init__(*args, **kwargs) 531 532 self.max_allow_num = max_allowed_num 533 with open(dict_path, encoding="utf-8") as fp: 534 ng_words = fp.read().split("\n") 535 ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0] 536 pat = r"|".join(ng_words) 537 self.keyword_pat = re.compile(pat) 538 539 def apply(self, doc: Document) -> Document: 540 """ 541 >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected 542 True 543 544 >>> DiscardAds().apply(Document("おはよう")).is_rejected 545 False 546 """ 547 ads_factor = self.keyword_pat.findall(doc.text) 548 if len(ads_factor) > self.max_allow_num: 549 doc.is_rejected = True 550 return doc
Discards documents that contain more than `max_allowed_num` (mostly advertisement-related) keywords.
The default `max_allowed_num` is 14.
The keyword list is read from the file specified by `dict_path`.
The file is a plain-text file with one word per line.
The default `dict_path` is /hojichar/dict/advertisement_keywords_ja.txt.
523 def __init__( 524 self, 525 dict_path: Union[str, PathLike] = BASE_PATH / "dict/advertisement_keywords_ja.txt", 526 max_allowed_num: int = 14, 527 *args: Any, 528 **kwargs: Any, 529 ): 530 super().__init__(*args, **kwargs) 531 532 self.max_allow_num = max_allowed_num 533 with open(dict_path, encoding="utf-8") as fp: 534 ng_words = fp.read().split("\n") 535 ng_words = [re.escape(w.strip()) for w in ng_words if not len(w) == 0] 536 pat = r"|".join(ng_words) 537 self.keyword_pat = re.compile(pat)
539 def apply(self, doc: Document) -> Document: 540 """ 541 >>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected 542 True 543 544 >>> DiscardAds().apply(Document("おはよう")).is_rejected 545 False 546 """ 547 ads_factor = self.keyword_pat.findall(doc.text) 548 if len(ads_factor) > self.max_allow_num: 549 doc.is_rejected = True 550 return doc
>>> DiscardAds().apply(Document("お問い合わせください 営業時間 よくある質問"*5)).is_rejected
True
>>> DiscardAds().apply(Document("おはよう")).is_rejected
False
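Building on the doctest above, a small sketch (not part of the library) of how `max_allowed_num` controls the decision: the filter rejects a document only when the number of keyword hits found by `findall` exceeds `max_allowed_num`.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardAds

promo = "お問い合わせください 営業時間 よくある質問 " * 5

default_filter = DiscardAds()                      # max_allowed_num = 14
lenient_filter = DiscardAds(max_allowed_num=100)   # allow up to 100 keyword hits

print(default_filter.apply(Document(promo)).is_rejected)  # True: hits exceed 14 (see the doctest above)
print(lenient_filter.apply(Document(promo)).is_rejected)  # False: the same hits stay within 100
```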
553class AcceptJapanese(Filter): 554 """ 555 日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます. 556 1. テキストを左から`lookup_size` (デフォルトで50字) 参照し, 557 ひらがな・カタカナが存在すれば日本語と判定する. 558 """ 559 560 def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None: 561 super().__init__(*args, **kwargs) 562 563 self.lookup_size = lookup_size 564 self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]") 565 566 def apply(self, doc: Document) -> Document: 567 """ 568 >>> AcceptJapanese().apply(Document("This is English document")).is_rejected 569 True 570 571 >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected 572 True 573 574 >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected 575 False 576 """ 577 if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]): 578 doc.is_rejected = True 579 return doc
Discards documents that are not in Japanese. Japanese detection is performed as follows:
1. Look at the first `lookup_size` characters of the text (50 by default); if they contain hiragana or katakana, the document is judged to be Japanese.
560 def __init__(self, lookup_size: int = 50, *args: Any, **kwargs: Any) -> None: 561 super().__init__(*args, **kwargs) 562 563 self.lookup_size = lookup_size 564 self.hiragana_katakana_pat = re.compile(r"[ぁ-んァ-ン]")
566 def apply(self, doc: Document) -> Document: 567 """ 568 >>> AcceptJapanese().apply(Document("This is English document")).is_rejected 569 True 570 571 >>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected 572 True 573 574 >>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected 575 False 576 """ 577 if not self.hiragana_katakana_pat.search(doc.text[: self.lookup_size]): 578 doc.is_rejected = True 579 return doc
>>> AcceptJapanese().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapanese().apply(Document("a"*50 + "あ")).is_rejected
True
>>> AcceptJapanese().apply(Document("ほうじ茶")).is_rejected
False
582class DiscardRareKuten(Filter): 583 """ 584 日本語でないドキュメントを破棄します. 日本語判定は次の手順で行われます 585 ドキュメントを句点"。"で区切り, 平均文長が 586 `max_avarage_sentence_length` より長い場合は破棄します. 587 `max_avarage_sentence_length` のデフォルト値は100です. 588 このフィルタは, 文章中の句点の割合が少なすぎるドキュメントを破棄します. 589 """ 590 591 def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None: 592 super().__init__(*args, **kwargs) 593 594 self.max_average_sentence_length = max_average_sentence_length 595 self.kuten_pat = re.compile(r"。") 596 597 def apply(self, doc: Document) -> Document: 598 """ 599 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected 600 False 601 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected 602 True 603 """ 604 kuten_lst = self.kuten_pat.findall(doc.text) 605 min_kuten_num = len(doc.text) / self.max_average_sentence_length 606 if len(kuten_lst) < min_kuten_num: 607 doc.is_rejected = True 608 return doc
Discards documents that do not look like Japanese prose.
The document is split on the kuten "。", and it is discarded if the average sentence length is longer than `max_average_sentence_length`.
The default value of `max_average_sentence_length` is 100.
In other words, this filter discards documents whose ratio of kuten is too low.
591 def __init__(self, max_average_sentence_length: int = 100, *args: Any, **kwargs: Any) -> None: 592 super().__init__(*args, **kwargs) 593 594 self.max_average_sentence_length = max_average_sentence_length 595 self.kuten_pat = re.compile(r"。")
597 def apply(self, doc: Document) -> Document: 598 """ 599 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected 600 False 601 >>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected 602 True 603 """ 604 kuten_lst = self.kuten_pat.findall(doc.text) 605 min_kuten_num = len(doc.text) / self.max_average_sentence_length 606 if len(kuten_lst) < min_kuten_num: 607 doc.is_rejected = True 608 return doc
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよ。")).is_rejected
False
>>> DiscardRareKuten(max_average_sentence_length=4).apply(Document("おはよう。")).is_rejected
True
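A small arithmetic sketch of the rejection condition: the minimum required number of kuten is len(text) / `max_average_sentence_length`, so a long document with few "。" is rejected. The sample text is a hypothetical example.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardRareKuten

text = "あ" * 250 + "。" + "い" * 100 + "。"   # 352 characters, only 2 kuten
# required minimum: len(text) / max_average_sentence_length = 352 / 100 = 3.52
print(DiscardRareKuten().apply(Document(text)).is_rejected)  # True: 2 < 3.52
```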
673class MaskPersonalInformation(Filter): 674 """ 675 ドキュメントに含まれる電話番号・電子メールアドレスを一部マスキングします. 676 """ 677 678 def __init__(self, *args: Any, **kwargs: Any) -> None: 679 super().__init__(*args, **kwargs) 680 681 self.phone_pat = re.compile( 682 r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}" # noqa 683 ) 684 self.email_pat = re.compile( 685 r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)" # noqa 686 ) 687 688 def apply(self, doc: Document) -> Document: 689 """ 690 >>> MaskPersonalInformation()('06-1234-5678') 691 '06-1234-XXXX' 692 >>> MaskPersonalInformation()('075-123-4567') 693 '075-123-XXXX' 694 >>> MaskPersonalInformation()('0166-12-3456') 695 '0166-12-XXXX' 696 >>> MaskPersonalInformation()('09808-1-2345') 697 '09808-1-XXXX' 698 >>> MaskPersonalInformation()('090-1234-5678') 699 '090-1234-XXXX' 700 >>> MaskPersonalInformation()('0751234567') 701 '075123XXXX' 702 >>> MaskPersonalInformation()('08012345678') 703 '0801234XXXX' 704 >>> MaskPersonalInformation()('連絡は075-123-4567 まで') 705 '連絡は075-123-XXXX まで' 706 >>> MaskPersonalInformation()('+81-80-1234-5678') 707 '+81-80-1234-XXXX' 708 >>> MaskPersonalInformation()('+818012345678') 709 '+81801234XXXX' 710 >>> MaskPersonalInformation()('hogehoge@example.com') 711 'xxxx@yyy.com' 712 >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡') 713 '何かあれば xxxx@yyy.jp まで連絡' 714 """ 715 text = self.phone_pat.sub(r"\1XXXX", doc.text) 716 text = self.email_pat.sub(r"xxxx@yyy\1", text) 717 doc.text = text 718 return doc
Partially masks telephone numbers and e-mail addresses contained in the document.
678 def __init__(self, *args: Any, **kwargs: Any) -> None: 679 super().__init__(*args, **kwargs) 680 681 self.phone_pat = re.compile( 682 r"((0|\+\d{1,3}[- ]?)(\d{2}[- ]?\d{4}[- ]?|\d[- ]?\d{4}[- ]?|\d{2}[- ]?\d{3}[- ]?|\d{3}[- ]?\d{2}[- ]?|\d{4}[- ]?\d{1}[- ]?))\d{4}" # noqa 683 ) 684 self.email_pat = re.compile( 685 r"[a-zA-Z0-9!#$%&'*+\-/=?^_`{|}~.]+@[A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]+(\.[A-Za-z0-9\-]+)" # noqa 686 )
688 def apply(self, doc: Document) -> Document: 689 """ 690 >>> MaskPersonalInformation()('06-1234-5678') 691 '06-1234-XXXX' 692 >>> MaskPersonalInformation()('075-123-4567') 693 '075-123-XXXX' 694 >>> MaskPersonalInformation()('0166-12-3456') 695 '0166-12-XXXX' 696 >>> MaskPersonalInformation()('09808-1-2345') 697 '09808-1-XXXX' 698 >>> MaskPersonalInformation()('090-1234-5678') 699 '090-1234-XXXX' 700 >>> MaskPersonalInformation()('0751234567') 701 '075123XXXX' 702 >>> MaskPersonalInformation()('08012345678') 703 '0801234XXXX' 704 >>> MaskPersonalInformation()('連絡は075-123-4567 まで') 705 '連絡は075-123-XXXX まで' 706 >>> MaskPersonalInformation()('+81-80-1234-5678') 707 '+81-80-1234-XXXX' 708 >>> MaskPersonalInformation()('+818012345678') 709 '+81801234XXXX' 710 >>> MaskPersonalInformation()('hogehoge@example.com') 711 'xxxx@yyy.com' 712 >>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡') 713 '何かあれば xxxx@yyy.jp まで連絡' 714 """ 715 text = self.phone_pat.sub(r"\1XXXX", doc.text) 716 text = self.email_pat.sub(r"xxxx@yyy\1", text) 717 doc.text = text 718 return doc
>>> MaskPersonalInformation()('06-1234-5678')
'06-1234-XXXX'
>>> MaskPersonalInformation()('075-123-4567')
'075-123-XXXX'
>>> MaskPersonalInformation()('0166-12-3456')
'0166-12-XXXX'
>>> MaskPersonalInformation()('09808-1-2345')
'09808-1-XXXX'
>>> MaskPersonalInformation()('090-1234-5678')
'090-1234-XXXX'
>>> MaskPersonalInformation()('0751234567')
'075123XXXX'
>>> MaskPersonalInformation()('08012345678')
'0801234XXXX'
>>> MaskPersonalInformation()('連絡は075-123-4567 まで')
'連絡は075-123-XXXX まで'
>>> MaskPersonalInformation()('+81-80-1234-5678')
'+81-80-1234-XXXX'
>>> MaskPersonalInformation()('+818012345678')
'+81801234XXXX'
>>> MaskPersonalInformation()('hogehoge@example.com')
'xxxx@yyy.com'
>>> MaskPersonalInformation()('何かあれば hogehoge@example.ne.jp まで連絡')
'何かあれば xxxx@yyy.jp まで連絡'
721class DiscardTooManyNouns(Filter): 722 """ 723 [!CAUTION] This filter requires `fugashi` package. Please install it 724 by `pip install 'hojichar[all]'`. 725 726 A filter that removes document with too many nouns in Japanese i.e., 727 documents such as advertisement, word salad, etc ... 728 """ 729 730 def __init__( 731 self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any 732 ) -> None: 733 """ 734 Args: 735 threshold: document whose noun ratio is higher than this value will be discarded 736 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 737 *args: 738 **kwargs: 739 """ 740 super().__init__(*args, **kwargs) 741 assert is_loaded_extras, ( 742 "fugashi is required for this filter. Try pip install 'hojichar[all]'" 743 ) 744 745 self.threshold = threshold 746 self.max_parse_chars = max_parse_chars 747 self.tagger = Tagger("-Owakati") 748 assert "unidic" in self.tagger.dictionary_info[0]["filename"], ( 749 "MeCab dictionary must be unidic" 750 ) 751 752 def _chunk_text(self, text: str) -> Iterable[str]: 753 """Slice text into chunks of `max_parse_chars` length.""" 754 step = self.max_parse_chars 755 for i in range(0, len(text), step): 756 yield text[i : i + step] 757 758 def apply(self, doc: Document) -> Document: 759 """ 760 >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected 761 False 762 >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected 763 True 764 >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected 765 False 766 """ 767 # remove "補助記号" from part-of-speech statistics 768 # because they often decrease the noun ratio, 769 # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5 770 # however, we don't want such sentence 771 772 pos_count: Counter[str] = Counter() 773 for chunk in self._chunk_text(doc.text): 774 for word in self.tagger(chunk): 775 if word.feature.pos1 != "補助記号": 776 pos_count[word.feature.pos1] += 1 777 778 try: 779 noun_ratio = pos_count["名詞"] / sum(pos_count.values()) 780 except ZeroDivisionError: 781 noun_ratio = 0.0 782 if noun_ratio >= self.threshold: 783 doc.is_rejected = True 784 return doc
[!CAUTION] This filter requires the `fugashi` package. Please install it with `pip install 'hojichar[all]'`.

A filter that removes documents with too many nouns in Japanese, i.e., documents such as advertisements, word salad, etc.
730 def __init__( 731 self, threshold: float = 0.80, max_parse_chars: int = 100_000, *args: Any, **kwargs: Any 732 ) -> None: 733 """ 734 Args: 735 threshold: document whose noun ratio is higher than this value will be discarded 736 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 737 *args: 738 **kwargs: 739 """ 740 super().__init__(*args, **kwargs) 741 assert is_loaded_extras, ( 742 "fugashi is required for this filter. Try pip install 'hojichar[all]'" 743 ) 744 745 self.threshold = threshold 746 self.max_parse_chars = max_parse_chars 747 self.tagger = Tagger("-Owakati") 748 assert "unidic" in self.tagger.dictionary_info[0]["filename"], ( 749 "MeCab dictionary must be unidic" 750 )
Args:
    threshold: document whose noun ratio is higher than this value will be discarded
    max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
    *args:
    **kwargs:
758 def apply(self, doc: Document) -> Document: 759 """ 760 >>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected 761 False 762 >>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected 763 True 764 >>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected 765 False 766 """ 767 # remove "補助記号" from part-of-speech statistics 768 # because they often decrease the noun ratio, 769 # e.g., the sentence "リンゴ・オレンジ・バナナ・" has 補助記号 ratio of 0.5 770 # however, we don't want such sentence 771 772 pos_count: Counter[str] = Counter() 773 for chunk in self._chunk_text(doc.text): 774 for word in self.tagger(chunk): 775 if word.feature.pos1 != "補助記号": 776 pos_count[word.feature.pos1] += 1 777 778 try: 779 noun_ratio = pos_count["名詞"] / sum(pos_count.values()) 780 except ZeroDivisionError: 781 noun_ratio = 0.0 782 if noun_ratio >= self.threshold: 783 doc.is_rejected = True 784 return doc
>>> DiscardTooManyNouns().apply(Document("自然言語処理大好き!")).is_rejected
False
>>> DiscardTooManyNouns().apply(Document("リンゴ・オレンジ・ミカン・バナナ セール中")).is_rejected
True
>>> DiscardTooManyNouns().apply(Document("今日の仙台朝市ではリンゴがセール中")).is_rejected
False
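A minimal sketch of the noun-ratio statistic itself, assuming the optional `fugashi` extra (with a unidic dictionary) is installed, e.g. via `pip install 'hojichar[all]'`.

```
from collections import Counter

from fugashi import Tagger

tagger = Tagger("-Owakati")
text = "リンゴ・オレンジ・ミカン・バナナ セール中"
# Count part-of-speech tags, excluding 補助記号 as the filter does.
pos_count = Counter(w.feature.pos1 for w in tagger(text) if w.feature.pos1 != "補助記号")
noun_ratio = pos_count["名詞"] / sum(pos_count.values())
print(noun_ratio)  # at least 0.80 for this text: the doctest above shows DiscardTooManyNouns rejects it
```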
787class CharRepetitionRatioFilter(Filter): 788 """ 789 文字Ngramの重なり率(文書中で高頻度文字Ngramが占める割合)を計算して, 重なりの大きいものを除去します. 790 名詞の連続からなるような広告テキストを取り除くのに有効です. 791 792 実装は, BigScience で採用されていた前処理を参考にしています. 793 元実装: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453 # noqa: E501 794 795 「高頻度文字Ngram」は、sqrt(ユニークなNgramの総数)によって求めていますが, 796 これは文書長の影響を軽減するためだとされています. 797 798 掲示板のテキストが引っかかりやすい傾向があります. 799 13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0 800 的なものが高頻度で登場するため、文字Ngramの重なり率も高くなってしまう 801 """ 802 803 def __init__( 804 self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any 805 ) -> None: 806 """ 807 808 Args: 809 threshold: document with character repetition ratio higher than this value will be discarded 810 ngram_size: character ngram size. Larger value will decrease the false positive of long documents 811 *args: 812 **kwargs: 813 """ # noqa: E501 814 815 super().__init__(*args, **kwargs) 816 self.threshold = threshold 817 self.ngram_size = ngram_size 818 819 def apply(self, doc: Document) -> Document: 820 ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size) 821 if ratio >= self.threshold: 822 doc.is_rejected = True 823 return doc 824 825 @staticmethod 826 def compute_character_repetition_ratio( 827 document: str, character_repetition_length: int 828 ) -> float: 829 def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]: 830 character_ngrams: List[str] = [ 831 document[i : i + n] for i in range(len(document) - n + 1) 832 ] 833 freq_character_ngrams_dict: Dict[str, int] = {} 834 for character_ngram in character_ngrams: 835 freq_character_ngrams_dict[character_ngram] = ( 836 freq_character_ngrams_dict.get(character_ngram, 0) + 1 837 ) 838 return freq_character_ngrams_dict 839 840 freq_character_ngrams_dict = get_freq_character_ngrams( 841 document, character_repetition_length 842 ) 843 if len(freq_character_ngrams_dict) == 0: 844 return 0.0 845 freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values()) 846 freq_character_ngrams = sorted(freq_character_ngrams, reverse=True) 847 val_one = len([el for el in freq_character_ngrams if el == 1]) 848 num_rep_character_ngrams = min( 849 int(np.sqrt(len(freq_character_ngrams))), 850 len(freq_character_ngrams) - val_one, 851 ) 852 character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum( 853 freq_character_ngrams 854 ) 855 return character_repetition_ratio
Computes the character n-gram repetition ratio (the share of the document occupied by high-frequency character n-grams) and removes documents with a large overlap.
This is effective for removing advertisement-like text made up of runs of nouns.

The implementation follows the preprocessing adopted by BigScience.
Original implementation: https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/filtering.py#L425-L453

The number of "high-frequency character n-grams" is taken to be sqrt(the number of unique n-grams), which is said to mitigate the influence of document length.

Bulletin-board text tends to be caught by this filter: lines like "13: 名無しさん@実況で競馬板アウト 2019/08/18(日) 15:28:46.10 ID:eBvZg8h+0" appear with high frequency, which drives the character n-gram repetition ratio up.
803 def __init__( 804 self, threshold: float = 0.33, ngram_size: int = 5, *args: Any, **kwargs: Any 805 ) -> None: 806 """ 807 808 Args: 809 threshold: document with character repetition ratio higher than this value will be discarded 810 ngram_size: character ngram size. Larger value will decrease the false positive of long documents 811 *args: 812 **kwargs: 813 """ # noqa: E501 814 815 super().__init__(*args, **kwargs) 816 self.threshold = threshold 817 self.ngram_size = ngram_size
Args:
    threshold: document with character repetition ratio higher than this value will be discarded
    ngram_size: character ngram size. Larger value will decrease the false positive of long documents
    *args:
    **kwargs:
819 def apply(self, doc: Document) -> Document: 820 ratio = self.compute_character_repetition_ratio(doc.text, self.ngram_size) 821 if ratio >= self.threshold: 822 doc.is_rejected = True 823 return doc
825 @staticmethod 826 def compute_character_repetition_ratio( 827 document: str, character_repetition_length: int 828 ) -> float: 829 def get_freq_character_ngrams(document: str, n: int) -> Dict[str, int]: 830 character_ngrams: List[str] = [ 831 document[i : i + n] for i in range(len(document) - n + 1) 832 ] 833 freq_character_ngrams_dict: Dict[str, int] = {} 834 for character_ngram in character_ngrams: 835 freq_character_ngrams_dict[character_ngram] = ( 836 freq_character_ngrams_dict.get(character_ngram, 0) + 1 837 ) 838 return freq_character_ngrams_dict 839 840 freq_character_ngrams_dict = get_freq_character_ngrams( 841 document, character_repetition_length 842 ) 843 if len(freq_character_ngrams_dict) == 0: 844 return 0.0 845 freq_character_ngrams: List[int] = list(freq_character_ngrams_dict.values()) 846 freq_character_ngrams = sorted(freq_character_ngrams, reverse=True) 847 val_one = len([el for el in freq_character_ngrams if el == 1]) 848 num_rep_character_ngrams = min( 849 int(np.sqrt(len(freq_character_ngrams))), 850 len(freq_character_ngrams) - val_one, 851 ) 852 character_repetition_ratio = sum(freq_character_ngrams[:num_rep_character_ngrams]) / sum( 853 freq_character_ngrams 854 ) 855 return character_repetition_ratio
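A small worked example of the statistic, using only the static method above (no optional dependencies). The sample texts are hypothetical.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import CharRepetitionRatioFilter

compute = CharRepetitionRatioFilter.compute_character_repetition_ratio
print(compute("あ" * 100, 5))                           # 1.0: every character 5-gram is identical
print(compute("春はあけぼの。やうやう白くなりゆく山際", 5))  # 0.0: no 5-gram occurs more than once
print(CharRepetitionRatioFilter().apply(Document("あ" * 100)).is_rejected)  # True: 1.0 >= default threshold 0.33
```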
858class WordRepetitionRatioFilter(Filter): 859 """ 860 [!CAUTION] This filter requires `fugashi` package. Please install it 861 by `pip install 'hojichar[all]'`. 862 863 単語Ngramの重なり率(文書中で重複する単語Ngramが占める割合)を計算して、重なりの大きいものを弾くためのフィルタ. 864 BigScienceで採用されていた前処理を参考にしている. 865 866 名詞が連打されているような広告テキストを取り除くのに有効な様子 867 まともな文書がたまたま2回繰り返されている場合もあり、これを取り除いて良いのかは分からない 868 例: 869 "ウェブ\n本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ)\n2013/05/10(10:57) 870 ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ 871 られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる 872 なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 873 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 874 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 875 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ)\n2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 876 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 877 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 878 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 879 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ 880 イの回復で収益が急回" 881 """ # noqa: E501 882 883 def __init__( 884 self, 885 threshold: float = 0.40, 886 ngram_size: int = 7, 887 max_parse_chars: int = 100_000, 888 *args: Any, 889 **kwargs: Any, 890 ) -> None: 891 """ 892 893 Args: 894 threshold: document whose character repetition ratio is higher than this value will be discarded 895 ngram_size: character ngram size. Larger value will decrease the false positive of long documents 896 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 897 *args: 898 **kwargs: 899 """ # noqa: E501 900 super().__init__(*args, **kwargs) 901 assert is_loaded_extras, ( 902 "fugashi is required for this filter. Try pip install 'hojichar[all]'" 903 ) 904 905 self.threshold = threshold 906 self.ngram_size = ngram_size 907 self.max_parse_chars = max_parse_chars 908 self.tagger = Tagger("-Owakati") 909 910 def _chunk_text(self, text: str) -> Iterable[str]: 911 """Split text into chunks of `max_parse_chars` length.""" 912 step = self.max_parse_chars 913 for i in range(0, len(text), step): 914 yield text[i : i + step] 915 916 def _get_freq_word_ngrams(self, words: List[str], n: int) -> Dict[str, int]: 917 freq: Dict[str, int] = {} 918 if n <= 0 or len(words) < n: 919 return freq 920 for i in range(len(words) - n + 1): 921 key = " ".join(words[i : i + n]) 922 freq[key] = freq.get(key, 0) + 1 923 return freq 924 925 def apply(self, doc: Document) -> Document: 926 ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size) 927 if ratio >= self.threshold: 928 doc.is_rejected = True 929 return doc 930 931 def compute_word_repetition_ratio(self, document: str, n: int) -> float: 932 total_counter: Counter[str] = Counter() 933 934 for chunk in self._chunk_text(document): 935 words = [w.surface for w in self.tagger(chunk)] 936 total_counter.update(self._get_freq_word_ngrams(words, n)) 937 938 if not total_counter: 939 return 0.0 940 941 total = sum(total_counter.values()) 942 repeated = sum(v for v in total_counter.values() if v > 1) 943 return repeated / total
[!CAUTION] This filter requires the `fugashi` package. Please install it with `pip install 'hojichar[all]'`.

A filter that computes the word n-gram repetition ratio (the share of the document covered by duplicated word n-grams) and rejects documents where the overlap is large.
It follows the preprocessing adopted by BigScience.

It appears effective at removing advertisement-like text in which nouns are strung together one after another.
Occasionally a perfectly normal document simply happens to be repeated twice, and it is unclear whether such documents should really be removed.
Example:
"ウェブ
本文: ニコンの上昇率16%超える、今3月期は経常76%の大幅増益見込む(ニコン) 2013年05月10日[minkabu PRESS] - みんなの株式 (みんかぶ) 2013/05/10(10:57) ニコン<7731.T>が急騰、寄り付き直後に前日比355円高の2537円まで買い上げ られ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入るなど急速に円安が進み、輸出株が軒並み高になる なか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増益を見込んだことが買い気を強めさせた。連結売上 高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、電子部品の低迷が足かせになり、2ケタ増収ながら 経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレイの回復で収益が急回復する。ニコンの株価は10時 56分現在2491円(△309円)出所:株経通信(株式会社みんかぶ) 2013/05/10 - ニコン(7731) の関連ニュース。 ニコン<7731.T>が急騰、寄 り付き直後に前日比355円高の2537円まで買い上げられ、上昇率は16%を超えた。外国為替市場で円が1ドル100円台、1ユーロ131円台に入 るなど急速に円安が進み、輸出株が軒並み高になるなか、9日取引終了後に発表した前年3月期決算で、今3月期は2ケタ近い増収で大幅増 益を見込んだことが買い気を強めさせた。連結売上高は前期比9.8%増の1兆1100億円、経常利益75.8%増の850億円を予想。前期は半導体、 電子部品の低迷が足かせになり、2ケタ増収ながら経常46%の大幅減益になったが、レンズ交換式デジタルカメラの拡大や液晶ディスプレ イの回復で収益が急回"
883 def __init__( 884 self, 885 threshold: float = 0.40, 886 ngram_size: int = 7, 887 max_parse_chars: int = 100_000, 888 *args: Any, 889 **kwargs: Any, 890 ) -> None: 891 """ 892 893 Args: 894 threshold: document whose character repetition ratio is higher than this value will be discarded 895 ngram_size: character ngram size. Larger value will decrease the false positive of long documents 896 max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document. 897 *args: 898 **kwargs: 899 """ # noqa: E501 900 super().__init__(*args, **kwargs) 901 assert is_loaded_extras, ( 902 "fugashi is required for this filter. Try pip install 'hojichar[all]'" 903 ) 904 905 self.threshold = threshold 906 self.ngram_size = ngram_size 907 self.max_parse_chars = max_parse_chars 908 self.tagger = Tagger("-Owakati")
Args:
    threshold: document whose word repetition ratio is higher than this value will be discarded
    ngram_size: word ngram size. Larger value will decrease the false positive of long documents
    max_parse_chars: maximum number of characters to parse in the document. Too large value may cause segmentation fault parsing the document.
    *args:
    **kwargs:
925 def apply(self, doc: Document) -> Document: 926 ratio = self.compute_word_repetition_ratio(doc.text, self.ngram_size) 927 if ratio >= self.threshold: 928 doc.is_rejected = True 929 return doc
931 def compute_word_repetition_ratio(self, document: str, n: int) -> float: 932 total_counter: Counter[str] = Counter() 933 934 for chunk in self._chunk_text(document): 935 words = [w.surface for w in self.tagger(chunk)] 936 total_counter.update(self._get_freq_word_ngrams(words, n)) 937 938 if not total_counter: 939 return 0.0 940 941 total = sum(total_counter.values()) 942 repeated = sum(v for v in total_counter.values() if v > 1) 943 return repeated / total
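A minimal sketch of the word repetition ratio, assuming the optional `fugashi` extra is installed (`pip install 'hojichar[all]'`). The duplicated text is a hypothetical example of a document pasted twice.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import WordRepetitionRatioFilter

word_filter = WordRepetitionRatioFilter(threshold=0.40, ngram_size=7)
duplicated = "今日は良い天気なので散歩に出かけました。" * 2   # the same sentence pasted twice
ratio = word_filter.compute_word_repetition_ratio(duplicated, 7)
print(ratio)  # most word 7-grams occur twice, so the ratio should exceed the 0.40 threshold
print(word_filter.apply(Document(duplicated)).is_rejected)  # True if the ratio exceeds 0.40
```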
946class DiscardTooManySpecialToken(Filter): 947 """ 948 [!CAUTION] This filter requires `emoji` package. Please install it 949 by `pip install 'hojichar[all]'`. 950 951 句読点を含む記号、空白、絵文字、その他特殊な文字を一定の割合以上含むような文書を取り除くためのフィルタ 952 元実装: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16 # noqa: E501 953 """ 954 955 def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None: 956 """ 957 958 Args: 959 threshold: document whose special token ratio is higher than this value will be discarded 960 *args: 961 **kwargs: 962 """ # noqa: E501 963 super().__init__(*args, **kwargs) 964 965 # digits are not regarded as special tokens 966 # otherwise many false positives are made, i.e., good documents discarded 967 main_special_characters = string.punctuation + string.whitespace # + string.digits 968 other_special_characters = ( 969 " ’“”–▬…✦�£•€«»°·═" 970 "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖" 971 "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" 972 "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" 973 "」﴾》�" 974 ) 975 976 en_emoji = emoji.EMOJI_DATA.keys() 977 978 special_characters_default = set(main_special_characters + other_special_characters) 979 special_characters_default.update(en_emoji) 980 self.special_characters = special_characters_default 981 982 self.threshold = threshold 983 984 def _compute_special_characters_ratio(self, text: str) -> float: 985 if len(text) == 0: 986 return 0 987 988 special_characters_ratio = len( 989 [char for char in text if char in self.special_characters] 990 ) / len(text) 991 return special_characters_ratio 992 993 def apply(self, doc: Document) -> Document: 994 special_characters_ratio = self._compute_special_characters_ratio(doc.text) 995 996 if special_characters_ratio > self.threshold: 997 doc.is_rejected = True 998 return doc
[!CAUTION] This filter requires the `emoji` package. Please install it with `pip install 'hojichar[all]'`.

A filter that removes documents in which symbols (including punctuation), whitespace, emoji, and other special characters make up more than a given ratio of the text.
Original implementation: BigScience https://github.com/bigscience-workshop/data-preparation/blob/9d0588419073cc5bf0fb92b58f37f2a1016572c3/preprocessing/training/01b_oscar_cleaning_and_filtering/parameters_filtering.py#L5-L16
955 def __init__(self, threshold: float = 0.4, *args: Any, **kwargs: Any) -> None: 956 """ 957 958 Args: 959 threshold: document whose special token ratio is higher than this value will be discarded 960 *args: 961 **kwargs: 962 """ # noqa: E501 963 super().__init__(*args, **kwargs) 964 965 # digits are not regarded as special tokens 966 # otherwise many false positives are made, i.e., good documents discarded 967 main_special_characters = string.punctuation + string.whitespace # + string.digits 968 other_special_characters = ( 969 " ’“”–▬…✦�£•€«»°·═" 970 "×士^˘⇓()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰ ‑≤≥‖" 971 "◆●■►▼▲▴∆▻¡★☆✱ːº。¯˜¥ɪ≈†:⁄♡✓⊕․.⋅÷1‟;،、¨ाাी्े◦˚" 972 "゜ʼ≖ʼ¤℃√!?【】‿∞➤~πه۩☛₨➩☻๑٪♥ıॽ《‘©﴿٬?▷Г♫∟™ª₪®「—❖" 973 "」﴾》�" 974 ) 975 976 en_emoji = emoji.EMOJI_DATA.keys() 977 978 special_characters_default = set(main_special_characters + other_special_characters) 979 special_characters_default.update(en_emoji) 980 self.special_characters = special_characters_default 981 982 self.threshold = threshold
Args:
    threshold: document whose special token ratio is higher than this value will be discarded
    *args:
    **kwargs:
993 def apply(self, doc: Document) -> Document: 994 special_characters_ratio = self._compute_special_characters_ratio(doc.text) 995 996 if special_characters_ratio > self.threshold: 997 doc.is_rejected = True 998 return doc
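A minimal usage sketch, assuming the optional `emoji` extra is installed (`pip install 'hojichar[all]'`); the sample texts are hypothetical.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManySpecialToken

special_filter = DiscardTooManySpecialToken(threshold=0.4)
print(special_filter.apply(Document("!!!???……★★★☆☆☆")).is_rejected)               # True: every character is a special token
print(special_filter.apply(Document("ほうじ茶を飲みながら原稿を書いた。")).is_rejected)  # False: only the final 。 is special
```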
1001class SingleCharacterRepetitionFilter(Filter): 1002 """ 1003 単一文字が大量に繰り返されているような文書を取り除くためのフィルタ 1004 そのような文書はノイズである可能性が高いため 1005 参考: BigScienceプロジェクトによると、oscarデータセットの中にバックスラッシュだけを2M個含むような文書が含まれていたらしい 1006 https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset # noqa: E501 1007 """ 1008 1009 def __init__( 1010 self, 1011 threshold: int = 200, 1012 *args: Any, 1013 **kwargs: Any, 1014 ) -> None: 1015 """ 1016 Args: 1017 threshold: The document is removed if character is repeated for this value or more 1018 *args: 1019 **kwargs: 1020 """ 1021 super().__init__(*args, **kwargs) 1022 self.threshold = threshold 1023 1024 def _is_repeat_contained(self, text: str) -> bool: 1025 groups = groupby(text) 1026 is_repeat_contained = any(sum(1 for _ in group) >= self.threshold for _, group in groups) 1027 return is_repeat_contained 1028 1029 def apply(self, doc: Document) -> Document: 1030 if self._is_repeat_contained(doc.text): 1031 doc.is_rejected = True 1032 return doc
A filter that removes documents in which a single character is repeated a huge number of times, since such documents are very likely to be noise.
Reference: according to the BigScience project, the OSCAR dataset contained a document consisting of nothing but 2M backslashes.
https://github.com/bigscience-workshop/bigscience/blob/master/train/tr8-104B-wide/chronicles.md#2m-backslash-only-samples-in-our-dataset
1009 def __init__( 1010 self, 1011 threshold: int = 200, 1012 *args: Any, 1013 **kwargs: Any, 1014 ) -> None: 1015 """ 1016 Args: 1017 threshold: The document is removed if character is repeated for this value or more 1018 *args: 1019 **kwargs: 1020 """ 1021 super().__init__(*args, **kwargs) 1022 self.threshold = threshold
Args:
    threshold: The document is removed if character is repeated for this value or more
    *args:
    **kwargs:
1029 def apply(self, doc: Document) -> Document: 1030 if self._is_repeat_contained(doc.text): 1031 doc.is_rejected = True 1032 return doc
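A minimal sketch of the run-length check: the filter rejects a document as soon as any single character is repeated `threshold` times in a row. The sample texts are hypothetical.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import SingleCharacterRepetitionFilter

run_filter = SingleCharacterRepetitionFilter(threshold=200)
print(run_filter.apply(Document("\\" * 200)).is_rejected)          # True: a run of 200 identical characters
print(run_filter.apply(Document("普通の文章" * 100)).is_rejected)    # False: the longest run is a single character
```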
1035class DiscardTooManyEndingEllipsis(Filter): 1036 """ 1037 ellipsisで終わるような行が大量に含まれるような文書を取り除くためのフィルタです. 1038 ellipsisとしては ... と … を用いている 1039 同様のフィルタが RedPajama v2で用いられています. 1040 1041 例として, 以下のような文書を検知します. 1042 ``` 1043 ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付... 1044 バツイチアラフォー 婚活ち女性の特徴と子持な付... 1045 ``` 1046 1047 デフォルトではしきい値を0.7としているが, これはC4から0.1%を削るような設定であり、 1048 precisionを重視した設定です. 1049 """ 1050 1051 def __init__( 1052 self, 1053 threshold: float = 0.7, 1054 *args: Any, 1055 **kwargs: Any, 1056 ) -> None: 1057 """ 1058 Args: 1059 threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value 1060 *args: 1061 **kwargs: 1062 """ # noqa: E501 1063 super().__init__(*args, **kwargs) 1064 self.threshold = threshold 1065 self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n") # matches ...\n and …\n 1066 1067 def apply(self, doc: Document) -> Document: 1068 ellipsis_count = len(self.ellipsis_pattern.findall(doc.text)) 1069 newline_count = max(doc.text.count("\n"), 1) # avoid zero division 1070 ellipsis_ratio = ellipsis_count / newline_count 1071 1072 if ellipsis_ratio > self.threshold: 1073 doc.is_rejected = True 1074 return doc
A filter that removes documents containing a large proportion of lines ending with an ellipsis ("..." and "…" are treated as ellipses).
A similar filter is used in RedPajama v2.

For example, it detects documents such as:
ペアーズは女性、という驚愕の過食が出ているのをごアラサーですか。時代から付...
バツイチアラフォー 婚活ち女性の特徴と子持な付...
The default threshold is 0.7; this setting removes roughly 0.1% of C4 and favors precision.
1051 def __init__( 1052 self, 1053 threshold: float = 0.7, 1054 *args: Any, 1055 **kwargs: Any, 1056 ) -> None: 1057 """ 1058 Args: 1059 threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value 1060 *args: 1061 **kwargs: 1062 """ # noqa: E501 1063 super().__init__(*args, **kwargs) 1064 self.threshold = threshold 1065 self.ellipsis_pattern = re.compile(r"(\.{3}|…)\n") # matches ...\n and …\n
Args:
    threshold: The document is removed if ratio of lines ending with ellipsis is higher than this value
    *args:
    **kwargs:
1067 def apply(self, doc: Document) -> Document: 1068 ellipsis_count = len(self.ellipsis_pattern.findall(doc.text)) 1069 newline_count = max(doc.text.count("\n"), 1) # avoid zero division 1070 ellipsis_ratio = ellipsis_count / newline_count 1071 1072 if ellipsis_ratio > self.threshold: 1073 doc.is_rejected = True 1074 return doc
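A minimal sketch of the line-ending ellipsis ratio; the snippet lines are hypothetical examples.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooManyEndingEllipsis

ellipsis_filter = DiscardTooManyEndingEllipsis(threshold=0.7)
truncated = "商品一覧はこちら...\nお得な情報はこちら...\n新着記事...\n"
print(ellipsis_filter.apply(Document(truncated)).is_rejected)  # True: 3 of 3 lines end with "..."
normal = "今日は晴れです。\n明日は雨の予報です。\n"
print(ellipsis_filter.apply(Document(normal)).is_rejected)     # False: no line ends with an ellipsis
```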
1077class DiscardTooShortLines(Filter): 1078 """ 1079 短い行を大量に含む文書を捨てるためのフィルタです. 1080 1081 メニューバーやパンくずリストのような要素を大量に含む文書を取り除くのに有効です. 1082 """ 1083 1084 def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None: 1085 """ 1086 Args: 1087 threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value. 1088 *args: 1089 **kwargs: 1090 """ # noqa: E501 1091 super().__init__(*args, **kwargs) 1092 self.threshold = threshold 1093 # この値は適当に決め打ち 1094 self.minimum_line_length = 10 1095 1096 def apply(self, doc: Document) -> Document: 1097 lines = [len(x) for x in doc.text.split("\n")] 1098 short_lines = [x for x in lines if x <= self.minimum_line_length] 1099 if (len(short_lines) / len(lines)) > self.threshold: 1100 doc.is_rejected = True 1101 return doc
A filter that discards documents containing a large proportion of short lines.

It is effective for removing documents that mostly consist of elements such as menu bars and breadcrumb lists.
1084 def __init__(self, threshold: float = 0.5, *args: Any, **kwargs: Any) -> None: 1085 """ 1086 Args: 1087 threshold: The document is removed if the ratio of short (<10 chars) lines are more than this value. 1088 *args: 1089 **kwargs: 1090 """ # noqa: E501 1091 super().__init__(*args, **kwargs) 1092 self.threshold = threshold 1093 # この値は適当に決め打ち 1094 self.minimum_line_length = 10
Args:
    threshold: The document is removed if the ratio of short (10 chars or fewer) lines is more than this value.
    *args:
    **kwargs:
1096 def apply(self, doc: Document) -> Document: 1097 lines = [len(x) for x in doc.text.split("\n")] 1098 short_lines = [x for x in lines if x <= self.minimum_line_length] 1099 if (len(short_lines) / len(lines)) > self.threshold: 1100 doc.is_rejected = True 1101 return doc
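A minimal sketch of the short-line ratio (lines of 10 characters or fewer count as short); the sample texts are hypothetical.

```
from hojichar.core.models import Document
from hojichar.filters.document_filters import DiscardTooShortLines

short_filter = DiscardTooShortLines(threshold=0.5)
breadcrumbs = "ホーム\n会社概要\nお問い合わせ\nアクセス"
print(short_filter.apply(Document(breadcrumbs)).is_rejected)  # True: all 4 lines are 10 characters or shorter
article = "ほうじ茶は茶葉を焙煎して作られる日本茶の一種です。\n香ばしい香りとすっきりした味わいが特徴です。"
print(short_filter.apply(Document(article)).is_rejected)      # False: both lines are longer than 10 characters
```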