hojichar.core.filter_interface
import logging
from typing import Any, Dict, Optional, Set

from hojichar.core.models import Document, Token


def _is_jsonable(data: Any) -> bool:
    """
    Return True when `data` is None or a bool, int, float, or str.

    Containers (tuple, list, dict) and every other type yield False, so only
    primitive values pass this check.
    """
    return data is None or isinstance(data, (bool, int, float, str))


class Filter:
    """
    Base class for all filters.
    Document-level filters must inherit from this class.

    The filter behavior is defined in the `apply` method.
    If you define a new filter, you must define that method.
    When an instance is called, it applies the filter from string to string.

    If the filter creates `Document.tokens` from `Document.text`, you
    must implement the `tokenize` method.
    If the filter updates `Document.text` by merging `Document.tokens`, you
    must implement the `merge` method.
    Do not define a filter that changes both `Document.text` and `Document.tokens`,
    to prevent unexpected behavior.

    If you want to apply a filter to tokens, use the `TokenFilter` class.

    Parameters
    ----------
    p: float
        The probability that `hojichar.Compose` applies this filter.
    skip_rejected: bool
        If set to `True`, `hojichar.Compose` makes this filter ignore documents
        that have the `is_rejected` flag.
        This flag is `True` by default since processing discarded documents
        in subsequent filters is meaningless. However, in some cases, docs that
        have been rejected need another filter. For example, when analyzing
        false positives, discarded docs must be passed to JSON dump filters.
        In such cases, set `skip_rejected` to `False` so the filter receives all docs.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Probability that this filter will be applied. Must satisfy
            0 <= p <= 1. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted and ignored by this base implementation.
        """
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.document_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, document: Document) -> Document:
        """Definition of filter behavior.

        In this method, the filter will modify `document.text`, or
        set `document.is_rejected = True` to discard the document.

        Do not define a filter that changes both `document.text` and `document.tokens`.

        Parameters
        ----------
        document : Document
            Input document

        Returns
        -------
        Document
            Processed Document

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """Run `apply` on `document` and return the result."""
        return self.apply(document)

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Document`, run `apply`, and return the resulting text."""
        return self.apply(Document(text)).text

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
        """
        Get the member variables of this filter.
        Eligible variables are those of primitive types; [bool, int, float, str, None],
        whose names do not start with an underscore; `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Variable names to leave out of the result. Default=None (exclude nothing)
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }


class TokenFilter:
    """
    Base class for token-level filters.

    Token filters, which should be implemented in hojichar/filters/token_filters.py,
    must inherit from this class.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Stored as `self.p`. Must satisfy 0 <= p <= 1. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted and ignored by this base implementation.
        """
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.token_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, token: Token) -> Token:
        """
        Definition of the token-level filter behavior.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """
        Apply `apply` to every token of `document` that is not rejected.

        Tokens whose `is_rejected` flag is set are left out of the new
        `document.tokens` list.
        """
        document.tokens = [self.apply(token) for token in document.tokens if not token.is_rejected]
        return document

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Token`, run `apply`, and return the resulting text."""
        return self.apply(Token(text)).text

    def get_jsonable_vars(self) -> dict:
        """
        Return every member variable obtained by `vars(self)` except "logger".

        Unlike `get_jsonalbe_vars`, the values are not checked for being
        primitive types and underscore-prefixed names are kept.
        """
        return {key: value for key, value in vars(self).items() if key != "logger"}

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
        """
        Get the member variables of this filter.
        Eligible variables are those of primitive types; [bool, int, float, str, None],
        whose names do not start with an underscore; `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Variable names to leave out of the result. Default=None (exclude nothing)
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }
class Filter:
    """
    Base class for all filters.
    Document-level filters must inherit from this class.

    The filter behavior is defined in the `apply` method.
    If you define a new filter, you must define that method.
    When an instance is called, it applies the filter from string to string.

    If the filter creates `Document.tokens` from `Document.text`, you
    must implement the `tokenize` method.
    If the filter updates `Document.text` by merging `Document.tokens`, you
    must implement the `merge` method.
    Do not define a filter that changes both `Document.text` and `Document.tokens`,
    to prevent unexpected behavior.

    If you want to apply a filter to tokens, use the `TokenFilter` class.

    Parameters
    ----------
    p: float
        The probability that `hojichar.Compose` applies this filter.
    skip_rejected: bool
        If set to `True`, `hojichar.Compose` makes this filter ignore documents
        that have the `is_rejected` flag.
        This flag is `True` by default since processing discarded documents
        in subsequent filters is meaningless. However, in some cases, docs that
        have been rejected need another filter. For example, when analyzing
        false positives, discarded docs must be passed to JSON dump filters.
        In such cases, set `skip_rejected` to `False` so the filter receives all docs.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Probability that this filter will be applied. Must satisfy
            0 <= p <= 1. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted and ignored by this base implementation.
        """
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.document_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, document: Document) -> Document:
        """Definition of filter behavior.

        In this method, the filter will modify `document.text`, or
        set `document.is_rejected = True` to discard the document.

        Do not define a filter that changes both `document.text` and `document.tokens`.

        Parameters
        ----------
        document : Document
            Input document

        Returns
        -------
        Document
            Processed Document

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """Run `apply` on `document` and return the result."""
        return self.apply(document)

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Document`, run `apply`, and return the resulting text."""
        return self.apply(Document(text)).text

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
        """
        Get the member variables of this filter.
        Eligible variables are those of primitive types; [bool, int, float, str, None],
        whose names do not start with an underscore; `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Variable names to leave out of the result. Default=None (exclude nothing)
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }
Base class for all filters. Document-level filters must inherit from this class.
The filter's behavior is defined in the `apply` method.
If you define a new filter, you must define the method.
When this class is called, apply the filter from string to string.
If the filter creates `Document.tokens` from `Document.text`, you must implement the `tokenize` method.
If the filter updates `Document.text` by merging `Document.tokens`, you must implement the `merge` method.
Do not define a filter that changes both `Document.text` and `Document.tokens`, to prevent unexpected behavior.
If you apply the filter to tokens, you can use TokenFilter
class.
Parameters
p: float
The probability that `hojichar.Compose` applies this filter.
skip_rejected: bool
If set to `True`, `hojichar.Compose` makes this filter ignore documents that have the `is_rejected` flag.
This flag is `True` by default, since processing discarded documents in subsequent filters is meaningless.
However, in some cases, docs that have been rejected need another filter. For example, when analyzing false positives, discarded docs must be passed to JSON dump filters.
In such cases, set `skip_rejected` to `False` so that the filter receives all docs.
def __init__(
    self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
) -> None:
    """
    Set up the name, logger, and application settings of the filter.

    Parameters
    ----------
    p : float, optional
        Probability that this filter will be applied; asserted to lie
        within [0, 1]. Default=1
    skip_rejected : bool, optional
        Stored unchanged on the instance. Default=True
    *args, **kwargs
        Accepted but not used here.
    """
    class_name = self.__class__.__name__
    self.name = class_name
    # The logger is created before `p` is validated, as in the original order.
    self.logger = logging.getLogger(f"hojichar.document_filters.{class_name}")
    assert 0 <= p <= 1
    self.p, self.skip_rejected = p, skip_rejected
Parameters
p : float, optional Probability that this filter will be applied. Default=1
def apply(self, document: Document) -> Document:
    """Definition of filter behavior.

    In this method, the filter will modify `document.text`, or
    set `document.is_rejected = True` to discard the document.

    Do not define a filter that changes both `document.text` and `document.tokens`.

    Parameters
    ----------
    document : Document
        Input document

    Returns
    -------
    Document
        Processed Document

    Raises
    ------
    NotImplementedError
        Always, in this base implementation; subclasses must override the method.
    """
    # The former `return document` after this raise was unreachable and is removed.
    raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")
Definition of filter behavior.
In this method, the filter will modify document.text
, or
set document.is_rejected = True
to discard the document.
Do not define a filter that changes both `document.text` and `document.tokens`.
Parameters
document : Document Input document
Returns
Document Processed Document
def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
    """
    Collect the exportable member variables of this filter.

    A variable is exported when its value is a primitive
    ([bool, int, float, str, None]), its name is not listed in
    `exclude_keys`, and its name does not start with an underscore; `_`.

    Parameters
    ----------
    exclude_keys : Optional[Set[str]]
        Names to omit from the result. Default=None (omit nothing)

    Returns
    -------
    Dict[str, Any]
        Mapping from variable name to value.
    """
    skipped = set() if exclude_keys is None else exclude_keys
    exported: Dict[str, Any] = {}
    for key, value in vars(self).items():
        if not _is_jsonable(value):
            continue
        if key in skipped or key.startswith("_"):
            continue
        exported[key] = value
    return exported
Get the member variables of this filter.
Eligible variables are those of primitive types; [bool, int, float, str, None],
whose names do not start with an underscore; `_`.
class TokenFilter:
    """
    Base class for token-level filters.

    Token filters, which should be implemented in hojichar/filters/token_filters.py,
    must inherit from this class.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Stored as `self.p`. Must satisfy 0 <= p <= 1. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted and ignored by this base implementation.
        """
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.token_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, token: Token) -> Token:
        """
        Definition of the token-level filter behavior.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """
        Apply `apply` to every token of `document` that is not rejected.

        Tokens whose `is_rejected` flag is set are left out of the new
        `document.tokens` list.
        """
        document.tokens = [self.apply(token) for token in document.tokens if not token.is_rejected]
        return document

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Token`, run `apply`, and return the resulting text."""
        return self.apply(Token(text)).text

    def get_jsonable_vars(self) -> dict:
        """
        Return every member variable obtained by `vars(self)` except "logger".

        Unlike `get_jsonalbe_vars`, the values are not checked for being
        primitive types and underscore-prefixed names are kept.
        """
        return {key: value for key, value in vars(self).items() if key != "logger"}

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
        """
        Get the member variables of this filter.
        Eligible variables are those of primitive types; [bool, int, float, str, None],
        whose names do not start with an underscore; `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Variable names to leave out of the result. Default=None (exclude nothing)
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }
Base class for token-level filters.
Token filters, which should be implemented in hojichar/filters/token_filters.py, must inherit from this class.
def __init__(
    self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
) -> None:
    """
    Set up the name, logger, and application settings of the token filter.

    Parameters
    ----------
    p : float, optional
        Stored unchanged on the instance; asserted to lie within [0, 1].
        Default=1
    skip_rejected : bool, optional
        Stored unchanged on the instance. Default=True
    *args, **kwargs
        Accepted but not used here.
    """
    class_name = self.__class__.__name__
    self.name = class_name
    # The logger is created before `p` is validated, as in the original order.
    self.logger = logging.getLogger(f"hojichar.token_filters.{class_name}")
    assert 0 <= p <= 1
    self.p, self.skip_rejected = p, skip_rejected
def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
    """
    Collect the exportable member variables of this filter.

    A variable is exported when its value is a primitive
    ([bool, int, float, str, None]), its name is not listed in
    `exclude_keys`, and its name does not start with an underscore; `_`.

    Parameters
    ----------
    exclude_keys : Optional[Set[str]]
        Names to omit from the result. Default=None (omit nothing)

    Returns
    -------
    dict
        Mapping from variable name to value.
    """
    skipped = set() if exclude_keys is None else exclude_keys
    exported = {}
    for key, value in vars(self).items():
        if not _is_jsonable(value):
            continue
        if key in skipped or key.startswith("_"):
            continue
        exported[key] = value
    return exported
Get the member variables of this filter.
Eligible variables are those of primitive types; [bool, int, float, str, None],
whose names do not start with an underscore; `_`.