hojichar.core.composition
import json
import logging
import pprint
from typing import Any, List, Optional, Union

import numpy as np

from hojichar.core.filter_interface import Filter, TokenFilter
from hojichar.core.inspection import Inspector, StatisticsCounter, StatsContainer
from hojichar.core.models import Document


class BeforeProcessFilter(Filter):
    def apply(self, doc: Document) -> Document:
        return doc


class Compose(Filter):
    def __init__(
        self,
        filters: List[Union[Filter, TokenFilter]],
        random_state: Optional[Union[int, np.random.Generator]] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Compose a filter from pre-defined filter-objects.
        Filter which has `skip_rejected` flag ignores a document which has `is_rejected` flag.
        By doing so, Compose avoid applying filters that do not affect the output.

        Parameters
        ----------
        filters : List[Union[Filter, TokenFilter]]
            Filter instances which apply to the corpus.

        random_state : Union[None, int, np.random.Generator], optional
            Default = None
            Seed for applying filters randomly.
            `random_state` must be int or np.random.Generator instance.
        """
        super().__init__(*args, **kwargs)
        self.set_filters(filters)
        self.logger = logging.getLogger("hojichar.Compose")
        self.before_process_inspector = Inspector(
            target_filter=BeforeProcessFilter(), filter_idx=-1
        )
        self.inspectors = [
            Inspector(target_filter=filter, filter_idx=idx)
            for idx, filter in enumerate(self.filters)
        ]
        self._statistics = StatisticsCounter(self.inspectors)

        # Turn random_state into a `np.random.Generator` instance.
        if random_state is None:
            self.rng = np.random.default_rng()
        elif isinstance(random_state, int):
            self.rng = np.random.default_rng(random_state)
        elif isinstance(random_state, np.random.Generator):
            self.rng = random_state
        else:
            raise ValueError(f"{random_state} cannot be used to seed.")

    def set_filters(self, filters: List[Union[Filter, TokenFilter]]) -> None:
        """
        Set the filter to a Compose object. The filter is expanded if the
        list of filters in the argument contains a filter bound by Compose.

        Args:
            filters (List[Union[Filter, TokenFilter]]): Target filters
        """
        self.filters: List[Union[Filter, TokenFilter]] = []
        for filter in filters:
            if isinstance(filter, Compose):
                self.filters.extend(filter.filters)
            else:
                self.filters.append(filter)

    def __call__(self, text: str) -> str:
        document = Document(text)
        document = self.apply(document)
        if document.is_rejected:
            return ""
        else:
            return document.text

    def _apply_filter(self, filt: Union[Filter, TokenFilter], document: Document) -> Document:
        if document.is_rejected and filt.skip_rejected:
            pass
        else:
            if filt.p == 1:
                document = filt.apply_filter(document)
            else:
                if self.rng.random() < filt.p:
                    document = filt.apply_filter(document)
        return document

    def apply(self, document: Document) -> Document:
        document = self.before_process_inspector.apply(document)
        previous_inspector = self.before_process_inspector
        for i, filt in enumerate(self.filters):
            inspector = self.inspectors[i]
            document = self._apply_filter(filt=filt, document=document)
            document = inspector.apply(document)
            if (not previous_inspector.is_rejected) and inspector.is_rejected:
                document.reject_reason = filt.get_jsonalbe_vars(exclude_keys={"skip_rejected"})
            previous_inspector = inspector

        self._statistics.update_changes(document, self.before_process_inspector, self.inspectors)
        return document

    @property
    def statistics(self) -> dict:
        return self._statistics.get_statistics()

    @property
    def statistics_obj(self) -> StatsContainer:
        return self._statistics.stats

    def summary(self, format: str = "print") -> None:
        info = [
            {
                "layer": i,
                "name": filt.name,
                "doc": filt.__doc__,
            }
            for i, filt in enumerate(self.filters)
        ]

        def to_json(filter_info: dict) -> dict:
            filter_info["doc"] = "".join(d.strip() for d in filter_info["doc"].split("\n"))
            return filter_info

        if format == "json":
            print(json.dumps(list(map(to_json, info)), ensure_ascii=False, indent="\t"))
        if format == "print":
            for layer in info:
                print(f"[{layer['layer']}] {layer['name']}")
                pprint.pprint(layer["doc"])
Base class for all filters. Document-level filters must inherit from this class.
A filter's behavior is defined in its apply method; if you define a new filter, you must implement that method. When an instance of this class is called, it applies the filter from string to string.
If the filter creates Document.tokens from Document.text, you must implement the tokenize method. If the filter updates Document.text by merging Document.tokens, you must implement the merge method. To prevent unexpected behavior, do not define a filter that changes both Document.text and Document.tokens. If you want to apply a filter to tokens, use the TokenFilter class.
Parameters
p : float
    The probability of applying this filter when it is organized by hojichar.Compose.
skip_rejected : bool
    If set to True, hojichar.Compose makes this filter ignore any document that has the is_rejected flag. This flag is True by default, since processing discarded documents in subsequent filters is meaningless. However, in some cases docs that have been rejected still need another filter; for example, to analyze false positives, discarded docs must be passed to a JSON dump filter. In such cases, set the skip_rejected flag to False so that all docs are passed through.
Definition of filter behavior, given by the apply method.
In this method, the filter modifies document.text, or sets document.is_rejected = True to discard the document. Do not define a filter that changes both document.text and document.tokens.
Parameters
document : Document
    Input document
Returns
Document
    Processed document
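For illustration, here is a minimal sketch of a document-level filter that follows the contract above. The class DiscardShortDocuments and its min_length parameter are hypothetical names invented for this example, and it assumes, as the parameter list above indicates, that Filter.__init__ accepts the p and skip_rejected keyword arguments.

from hojichar.core.filter_interface import Filter
from hojichar.core.models import Document


class DiscardShortDocuments(Filter):
    """Reject documents whose text is shorter than min_length characters."""

    def __init__(self, min_length: int = 100, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)  # forwards p and skip_rejected to Filter
        self.min_length = min_length

    def apply(self, document: Document) -> Document:
        # Only inspect document.text and set is_rejected; never touch both
        # document.text and document.tokens in the same filter.
        if len(document.text) < self.min_length:
            document.is_rejected = True
        return document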
Compose a filter from pre-defined filter objects.
A filter whose skip_rejected flag is set ignores any document that carries the is_rejected flag; by doing so, Compose avoids applying filters that would not affect the output.
Parameters
filters : List[Union[Filter, TokenFilter]]
    Filter instances which are applied to the corpus.
random_state : Union[None, int, np.random.Generator], optional
    Default = None.
    Seed used when filters are applied probabilistically; random_state must be an int or an np.random.Generator instance.
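A usage sketch, relying only on behavior visible in the source above (Compose.__call__ returns an empty string for rejected documents, summary prints the layers, and statistics returns a dict). DiscardShortDocuments is the hypothetical filter defined earlier, not part of hojichar.

from hojichar.core.composition import Compose

cleaner = Compose(
    [
        DiscardShortDocuments(min_length=50),          # p defaults to 1, so always applied
        DiscardShortDocuments(min_length=200, p=0.5),  # applied to roughly half of the documents
    ],
    random_state=42,  # fix the seed so probabilistic application is reproducible
)

print(cleaner("too short"))  # "" because the document was rejected
cleaner.summary()            # prints each layer's index, name, and docstring
print(cleaner.statistics)    # cumulative statistics gathered by the inspectors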
set_filters sets the filters on a Compose object. The list is expanded in place if the filters passed as the argument contain a filter that is already bound to a Compose.
Args:
    filters (List[Union[Filter, TokenFilter]]): Target filters
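A short sketch of this flattening behavior, again using the hypothetical DiscardShortDocuments filter from above: nesting a Compose inside the filters list splices its filters into the outer pipeline instead of keeping a nested Compose.

inner = Compose([DiscardShortDocuments(min_length=10)])
outer = Compose([inner, DiscardShortDocuments(min_length=500)])

# set_filters expands the inner Compose, so outer.filters holds two plain
# filters rather than a nested Compose instance.
print(len(outer.filters))                                   # -> 2
print(any(isinstance(f, Compose) for f in outer.filters))   # -> False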