hojichar.filters.language_identification
import hashlib
import logging
import os
import time
from os import PathLike
from pathlib import Path
from typing import Any, Tuple, Union

try:
    import requests
    from fasttext import load_model  # type: ignore

    is_loaded_extras = True
except ImportError:
    is_loaded_extras = False

from tqdm import tqdm

from hojichar import Document, Filter

logger = logging.getLogger(__name__)


FASTTEXT_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
MODEL_CHECKSUM = "01810bc59c6a3d2b79c79e6336612f65"


def _download_with_progress_bar(
    download_url: str, save_path: Union[str, PathLike], retries: int = 3, delay: float = 1.0
) -> None:
    """
    Stream `download_url` into `save_path`, showing a tqdm progress bar.

    The payload is first written to a sibling `<name>.part` file and only moved
    onto `save_path` after the whole response has been written, so an interrupted
    download never leaves a truncated file at the final location.

    Args:
        download_url: URL to fetch.
        save_path: Destination file path. Missing parent directories are created.
        retries: Number of additional attempts after a `requests.RequestException`.
        delay: Seconds to sleep before each retry.

    Raises:
        requests.RequestException: if the download still fails once the retries
            are exhausted.
    """
    # HACK type hint `os.PathLike[str]` is not allowed in Python 3.8 or older.
    # So I write Union[str, PathLike]. In the future, I will use `os.PathLike[str]` or simply Path.
    destination = Path(save_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    partial_path = destination.with_name(destination.name + ".part")

    retries_left = retries
    while True:
        try:
            with requests.get(download_url, stream=True) as r:
                r.raise_for_status()
                total_size = int(r.headers.get("content-length", 0))
                with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
                    with open(partial_path, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                            pbar.update(len(chunk))
            # Publish the file only once it has been written completely.
            os.replace(partial_path, destination)
            return
        except requests.RequestException as e:
            if retries_left <= 0:
                logger.error(f"Download failed after retries: {e}")
                raise
            logger.warning(
                f"Download failed, retrying in {delay} seconds... ({retries_left} retries left)"
            )
            time.sleep(delay)
            retries_left -= 1
        finally:
            # After a successful os.replace the partial file no longer exists;
            # after any failure this removes the incomplete data.
            partial_path.unlink(missing_ok=True)


def _get_md5_hash_of_file(file_path: Union[str, PathLike]) -> str:
    """
    Function to calculate the MD5 hash of a file.

    Read the file in chunks to avoid loading large files into memory.
    cf. https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal MD5 digest of the file contents.
    """
    md5_hash = hashlib.md5()
    with open(file_path, "rb") as file:
        while chunk := file.read(8192):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()


def _download_fasttext_model(model_path: Union[str, PathLike]) -> None:
    """Download the fastText LID model from `FASTTEXT_MODEL_URL` to `model_path`."""
    logger.info(f"Downloading fasttext model from {FASTTEXT_MODEL_URL} to {model_path}...")
    _download_with_progress_bar(FASTTEXT_MODEL_URL, model_path)


class LanguageIdentificationByFastText(Filter):
    """
    A filter that rejects documents that are not written in the specified language,
    using Language IDentification (LID).
    Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html
    The available languages are:
        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb
        ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom
        gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv
        kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
        nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah
        sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
        vep vi vls vo wa war wuu xal xmf yi yo yue zh
    """

    def __init__(
        self,
        language: str,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            language: the language to be accepted.
            lang_score_threshold: documents whose fasttext score is
                below this threshold will be discarded.
                The default value is 0.50, which is empirically a very generous value,
                i.e., almost all documents are accepted if the document
                is classified as the target language.
            model_path: The path of the model file. If the file does not exist,
                the model is downloaded to this path.
                If None, `lid.176.bin` in the current directory is used.
            *args: forwarded to the base filter.
            **kwargs: forwarded to the base filter.

        Raises:
            ImportError: if the optional dependencies could not be imported.
            AssertionError: if the MD5 checksum of the model file does not match
                `MODEL_CHECKSUM`.
        """
        super().__init__(*args, **kwargs)
        if not is_loaded_extras:
            raise ImportError(
                "The `fasttext` package is required to use this filter. "
                "Please install it by running `pip install hojichar[all]` "
                "or `pip install fasttext requests`."
            )

        self.lang_score_threshold = lang_score_threshold
        self.language = language

        self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin"

        if not self.model_path.exists():
            logger.info("Fasttext model file was not found.")
            _download_fasttext_model(self.model_path)

        actual_checksum = _get_md5_hash_of_file(self.model_path)
        if actual_checksum != MODEL_CHECKSUM:
            # Raised explicitly instead of via `assert` so the check is not stripped
            # under `python -O`; AssertionError keeps the exception type unchanged.
            raise AssertionError(
                f"Checksum of the downloaded model file does not match the expected value. "
                f"Expected: {MODEL_CHECKSUM}, "
                f"Actual: {actual_checksum}"
            )
        self.model = load_model(str(self.model_path))

    def _predict_language(self, text: str) -> Tuple[str, float]:
        """Return the top predicted label (without `__label__`) and its score."""
        # fasttext cannot handle multiline input
        # so we must remove the newline character
        text = text.strip().replace("\n", " ")
        pred = self.model.predict(text)
        pred_lang = pred[0][0].replace("__label__", "")
        pred_score = pred[1][0]
        return pred_lang, pred_score

    def apply(self, doc: Document) -> Document:
        """
        Mark `doc` as rejected unless it is predicted to be in `self.language`
        with a score of at least `self.lang_score_threshold`.
        """
        pred_lang, score = self._predict_language(doc.text)
        if not (pred_lang == self.language and score >= self.lang_score_threshold):
            doc.is_rejected = True
        return doc


class AcceptJapaneseByFastText(LanguageIdentificationByFastText):
    """
    A filter that removes non-Japanese text via Language Identification (LID) by FastText.

    >>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
    True
    >>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
    False
    >>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
    True
    """

    def __init__(
        self,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Same as the parent constructor with the accepted language fixed to "ja"."""
        super().__init__("ja", lang_score_threshold, model_path, *args, **kwargs)
class LanguageIdentificationByFastText(Filter):
    """
    A filter that rejects documents that are not written in the specified language,
    using Language IDentification (LID).
    Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html
    The available languages are:
        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb
        ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom
        gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv
        kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
        nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah
        sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
        vep vi vls vo wa war wuu xal xmf yi yo yue zh
    """

    def __init__(
        self,
        language: str,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            language: the language to be accepted.
            lang_score_threshold: documents whose fasttext score is
                below this threshold will be discarded.
                The default value is 0.50, which is empirically a very generous value,
                i.e., almost all documents are accepted if the document
                is classified as the target language.
            model_path: The path of the model file. If the file does not exist,
                the model is downloaded to this path.
                If None, `lid.176.bin` in the current directory is used.
            *args: forwarded to the base filter.
            **kwargs: forwarded to the base filter.

        Raises:
            ImportError: if the optional dependencies could not be imported.
            AssertionError: if the MD5 checksum of the model file does not match
                `MODEL_CHECKSUM`.
        """
        super().__init__(*args, **kwargs)
        if not is_loaded_extras:
            raise ImportError(
                "The `fasttext` package is required to use this filter. "
                "Please install it by running `pip install hojichar[all]` "
                "or `pip install fasttext requests`."
            )

        self.lang_score_threshold = lang_score_threshold
        self.language = language

        self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin"

        if not self.model_path.exists():
            logger.info("Fasttext model file was not found.")
            _download_fasttext_model(self.model_path)

        actual_checksum = _get_md5_hash_of_file(self.model_path)
        if actual_checksum != MODEL_CHECKSUM:
            # Raised explicitly instead of via `assert` so the check is not stripped
            # under `python -O`; AssertionError keeps the exception type unchanged.
            raise AssertionError(
                f"Checksum of the downloaded model file does not match the expected value. "
                f"Expected: {MODEL_CHECKSUM}, "
                f"Actual: {actual_checksum}"
            )
        self.model = load_model(str(self.model_path))

    def _predict_language(self, text: str) -> Tuple[str, float]:
        """Return the top predicted label (without `__label__`) and its score."""
        # fasttext cannot handle multiline input
        # so we must remove the newline character
        text = text.strip().replace("\n", " ")
        pred = self.model.predict(text)
        pred_lang = pred[0][0].replace("__label__", "")
        pred_score = pred[1][0]
        return pred_lang, pred_score

    def apply(self, doc: Document) -> Document:
        """
        Mark `doc` as rejected unless it is predicted to be in `self.language`
        with a score of at least `self.lang_score_threshold`.
        """
        pred_lang, score = self._predict_language(doc.text)
        if not (pred_lang == self.language and score >= self.lang_score_threshold):
            doc.is_rejected = True
        return doc
A filter that removes text that is not written in the specified language, using Language IDentification (LID). Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html The available languages are: af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
90 def __init__( 91 self, 92 language: str, 93 lang_score_threshold: float = 0.50, 94 model_path: Union[str, PathLike, None] = None, 95 *args: Any, 96 **kwargs: Any, 97 ) -> None: 98 """ 99 Args: 100 language: the language to be accepted. 101 lang_score_threshold: the document whose fasttext score is 102 below this threshold will be discarded. 103 A default value is 0.50, which is empirically very generous value, 104 i.e., almost all documents are accepted if the document 105 is classified as japanese text. 106 model_path: The directory path, which the model saved in. 107 If None, the model will be saved in the current directory. 108 *args: 109 **kwargs: 110 """ 111 super().__init__(*args, **kwargs) 112 if not is_loaded_extras: 113 raise ImportError( 114 "The `fasttext` package is required to use this filter. " 115 "Please install it by running `pip install hojichar[all]`" 116 "or `pip install fasttext requests`." 117 ) 118 119 self.lang_score_threshold = lang_score_threshold 120 self.language = language 121 122 self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin" 123 124 if not self.model_path.exists(): 125 logger.info("Fasttext model file was not found.") 126 _download_fasttext_model(self.model_path) 127 128 assert _get_md5_hash_of_file(self.model_path) == MODEL_CHECKSUM, ( 129 f"Checksum of the downloaded model file does not match the expected value. " 130 f"Expected: {MODEL_CHECKSUM}, " 131 f"Actual: {_get_md5_hash_of_file(self.model_path)}" 132 ) 133 self.model = load_model(str(self.model_path))
Args: language: the language to be accepted. lang_score_threshold: documents whose fasttext score is below this threshold will be discarded. The default value is 0.50, which is empirically a very generous value, i.e., almost all documents are accepted if the document is classified as the target language. model_path: The path of the model file. If None, the model will be saved in the current directory. *args: **kwargs:
144 def apply(self, doc: Document) -> Document: 145 pred_lang, score = self._predict_language(doc.text) 146 if not (pred_lang == self.language and score >= self.lang_score_threshold): 147 doc.is_rejected = True 148 return doc
Definition of filter behavior.
The document must implement the TextContent protocol; in most cases it is an instance of the hojichar.Document class.
In this method, the filter may modify document.text or document.extras, and sets document.is_rejected = True to discard the document.
Parameters
document : Document Input document
Returns
Document Processed Document
class AcceptJapaneseByFastText(LanguageIdentificationByFastText):
    """
    Keep Japanese documents only, judged by fastText Language Identification (LID).

    This is `LanguageIdentificationByFastText` with the accepted language fixed to "ja".

    >>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
    True
    >>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
    False
    >>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
    True
    """

    def __init__(
        self,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Forward all options to the parent, passing "ja" as the language."""
        accepted_language = "ja"
        super().__init__(accepted_language, lang_score_threshold, model_path, *args, **kwargs)
A filter that removes non-Japanese text via Language Identification (LID) by FastText.
>>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
False
>>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
True
163 def __init__( 164 self, 165 lang_score_threshold: float = 0.50, 166 model_path: Union[str, PathLike, None] = None, 167 *args: Any, 168 **kwargs: Any, 169 ) -> None: 170 super().__init__("ja", lang_score_threshold, model_path, *args, **kwargs)
Args: language: the language to be accepted (fixed to "ja" for this class). lang_score_threshold: documents whose fasttext score is below this threshold will be discarded. The default value is 0.50, which is empirically a very generous value, i.e., almost all documents are accepted if the document is classified as Japanese text. model_path: The path of the model file. If None, the model will be saved in the current directory. *args: **kwargs: