hojichar.core.composition

  1import json
  2import logging
  3import pprint
  4from typing import Any, List, Optional, Union
  5
  6import numpy as np
  7
  8from hojichar.core.filter_interface import Filter, TokenFilter
  9from hojichar.core.inspection import Inspector, StatisticsCounter, StatsContainer
 10from hojichar.core.models import Document
 11
 12
 13class BeforeProcessFilter(Filter):
 14    def apply(self, doc: Document) -> Document:
 15        return doc
 16
 17
 18class Compose(Filter):
 19    def __init__(
 20        self,
 21        filters: List[Union[Filter, TokenFilter]],
 22        random_state: Optional[Union[int, np.random.Generator]] = None,
 23        *args: Any,
 24        **kwargs: Any,
 25    ) -> None:
 26        """
 27        Compose a filter from pre-defined filter-objects.
 28        Filter which has `skip_rejected` flag ignores a document which has `is_rejected` flag.
 29        By doing so, Compose avoid applying filters that do not affect the output.
 30
 31        Parameters
 32        ----------
 33        filters : List[Union[Filter, TokenFilter]]
 34            Filter instances which apply to the corpus.
 35
 36        random_state : Union[None, int, np.random.Generator], optional
 37            Default = None
 38            Seed for applying filters randomly.
 39            `random_state` must be int or np.random.Generator instance.
 40        """
 41        super().__init__(*args, **kwargs)
 42        self.set_filters(filters)
 43        self.logger = logging.getLogger("hojichar.Compose")
 44        self.before_process_inspector = Inspector(
 45            target_filter=BeforeProcessFilter(), filter_idx=-1
 46        )
 47        self.inspectors = [
 48            Inspector(target_filter=filter, filter_idx=idx)
 49            for idx, filter in enumerate(self.filters)
 50        ]
 51        self._statistics = StatisticsCounter(self.inspectors)
 52
 53        # Turn random_state into a `np.random.Generator` instance.
 54        if random_state is None:
 55            self.rng = np.random.default_rng()
 56        elif isinstance(random_state, int):
 57            self.rng = np.random.default_rng(random_state)
 58        elif isinstance(random_state, np.random.Generator):
 59            self.rng = random_state
 60        else:
 61            raise ValueError(f"{random_state} cannot be used to seed.")
 62
 63    def set_filters(self, filters: List[Union[Filter, TokenFilter]]) -> None:
 64        """
 65        Set the filter to a Compose object. The filter is expanded if the
 66        list of filters in the argument contains a filter bound by Compose.
 67
 68        Args:
 69            filters (List[Union[Filter, TokenFilter]]): Target filters
 70        """
 71        self.filters: List[Union[Filter, TokenFilter]] = []
 72        for filter in filters:
 73            if isinstance(filter, Compose):
 74                self.filters.extend(filter.filters)
 75            else:
 76                self.filters.append(filter)
 77
 78    def __call__(self, text: str) -> str:
 79        document = Document(text)
 80        document = self.apply(document)
 81        if document.is_rejected:
 82            return ""
 83        else:
 84            return document.text
 85
 86    def _apply_filter(self, filt: Union[Filter, TokenFilter], document: Document) -> Document:
 87        if document.is_rejected and filt.skip_rejected:
 88            pass
 89        else:
 90            if filt.p == 1:
 91                document = filt.apply_filter(document)
 92            else:
 93                if self.rng.random() < filt.p:
 94                    document = filt.apply_filter(document)
 95        return document
 96
 97    def apply(self, document: Document) -> Document:
 98        document = self.before_process_inspector.apply(document)
 99        previous_inspector = self.before_process_inspector
100        for i, filt in enumerate(self.filters):
101            inspector = self.inspectors[i]
102            document = self._apply_filter(filt=filt, document=document)
103            document = inspector.apply(document)
104            if (not previous_inspector.is_rejected) and inspector.is_rejected:
105                document.reject_reason = filt.get_jsonalbe_vars(exclude_keys={"skip_rejected"})
106            previous_inspector = inspector
107
108        self._statistics.update_changes(document, self.before_process_inspector, self.inspectors)
109        return document
110
111    @property
112    def statistics(self) -> dict:
113        return self._statistics.get_statistics()
114
115    @property
116    def statistics_obj(self) -> StatsContainer:
117        return self._statistics.stats
118
119    def summary(self, format: str = "print") -> None:
120        info = [
121            {
122                "layer": i,
123                "name": filt.name,
124                "doc": filt.__doc__,
125            }
126            for i, filt in enumerate(self.filters)
127        ]
128
129        def to_json(filter_info: dict) -> dict:
130            filter_info["doc"] = "".join(d.strip() for d in filter_info["doc"].split("\n"))
131            return filter_info
132
133        if format == "json":
134            print(json.dumps(list(map(to_json, info)), ensure_ascii=False, indent="\t"))
135        if format == "print":
136            for layer in info:
137                print(f"[{layer['layer']}] {layer['name']}")
138                pprint.pprint(layer["doc"])
class BeforeProcessFilter(hojichar.core.filter_interface.Filter):
class BeforeProcessFilter(Filter):
    """
    Identity filter: returns every document untouched.

    Compose wraps an instance of this class in an `Inspector` (with
    `filter_idx=-1`) so the document can be inspected before any real filter
    has been applied.
    """

    def apply(self, doc: Document) -> Document:
        # Intentionally a no-op; the document is handed back as-is.
        return doc

Base class for all filters. Document-level filters must inherit from this class.

The definition of filter function is in apply method. If you define a new filter, you must define the method. When this class is called, apply the filter from string to string.

If the filter creates Document.tokens from Document.text, you must implement the tokenize method. If the filter updates Document.text by merging Document.tokens, you must implement the merge method. Do not define a filter that changes both Document.text and Document.token, to prevent unexpected behavior.

If you apply the filter to tokens, you can use TokenFilter class.

Parameters

p: float The probability with which hojichar.Compose applies this filter. skip_rejected: bool If set to True, hojichar.Compose makes this filter ignore documents that have the is_rejected flag set. This flag is True by default since processing discarded documents in subsequent filters is meaningless. However, in some cases, rejected documents need another filter. For example, when analyzing false positives, discarded documents must be passed to JSON dump filters. In such cases, set the skip_rejected flag to False so that the filter receives all documents.

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
15    def apply(self, doc: Document) -> Document:
16        return doc

Definition of filter behavior.

In this method, the filter will modify document.text, or set document.is_rejected = True to discard the document.

Do not define a filter that changes both document.text and document.token

Parameters

document : Document Input document

Returns

Document Processed Document

class Compose(hojichar.core.filter_interface.Filter):
 19class Compose(Filter):
 20    def __init__(
 21        self,
 22        filters: List[Union[Filter, TokenFilter]],
 23        random_state: Optional[Union[int, np.random.Generator]] = None,
 24        *args: Any,
 25        **kwargs: Any,
 26    ) -> None:
 27        """
 28        Compose a filter from pre-defined filter-objects.
 29        Filter which has `skip_rejected` flag ignores a document which has `is_rejected` flag.
 30        By doing so, Compose avoid applying filters that do not affect the output.
 31
 32        Parameters
 33        ----------
 34        filters : List[Union[Filter, TokenFilter]]
 35            Filter instances which apply to the corpus.
 36
 37        random_state : Union[None, int, np.random.Generator], optional
 38            Default = None
 39            Seed for applying filters randomly.
 40            `random_state` must be int or np.random.Generator instance.
 41        """
 42        super().__init__(*args, **kwargs)
 43        self.set_filters(filters)
 44        self.logger = logging.getLogger("hojichar.Compose")
 45        self.before_process_inspector = Inspector(
 46            target_filter=BeforeProcessFilter(), filter_idx=-1
 47        )
 48        self.inspectors = [
 49            Inspector(target_filter=filter, filter_idx=idx)
 50            for idx, filter in enumerate(self.filters)
 51        ]
 52        self._statistics = StatisticsCounter(self.inspectors)
 53
 54        # Turn random_state into a `np.random.Generator` instance.
 55        if random_state is None:
 56            self.rng = np.random.default_rng()
 57        elif isinstance(random_state, int):
 58            self.rng = np.random.default_rng(random_state)
 59        elif isinstance(random_state, np.random.Generator):
 60            self.rng = random_state
 61        else:
 62            raise ValueError(f"{random_state} cannot be used to seed.")
 63
 64    def set_filters(self, filters: List[Union[Filter, TokenFilter]]) -> None:
 65        """
 66        Set the filter to a Compose object. The filter is expanded if the
 67        list of filters in the argument contains a filter bound by Compose.
 68
 69        Args:
 70            filters (List[Union[Filter, TokenFilter]]): Target filters
 71        """
 72        self.filters: List[Union[Filter, TokenFilter]] = []
 73        for filter in filters:
 74            if isinstance(filter, Compose):
 75                self.filters.extend(filter.filters)
 76            else:
 77                self.filters.append(filter)
 78
 79    def __call__(self, text: str) -> str:
 80        document = Document(text)
 81        document = self.apply(document)
 82        if document.is_rejected:
 83            return ""
 84        else:
 85            return document.text
 86
 87    def _apply_filter(self, filt: Union[Filter, TokenFilter], document: Document) -> Document:
 88        if document.is_rejected and filt.skip_rejected:
 89            pass
 90        else:
 91            if filt.p == 1:
 92                document = filt.apply_filter(document)
 93            else:
 94                if self.rng.random() < filt.p:
 95                    document = filt.apply_filter(document)
 96        return document
 97
 98    def apply(self, document: Document) -> Document:
 99        document = self.before_process_inspector.apply(document)
100        previous_inspector = self.before_process_inspector
101        for i, filt in enumerate(self.filters):
102            inspector = self.inspectors[i]
103            document = self._apply_filter(filt=filt, document=document)
104            document = inspector.apply(document)
105            if (not previous_inspector.is_rejected) and inspector.is_rejected:
106                document.reject_reason = filt.get_jsonalbe_vars(exclude_keys={"skip_rejected"})
107            previous_inspector = inspector
108
109        self._statistics.update_changes(document, self.before_process_inspector, self.inspectors)
110        return document
111
112    @property
113    def statistics(self) -> dict:
114        return self._statistics.get_statistics()
115
116    @property
117    def statistics_obj(self) -> StatsContainer:
118        return self._statistics.stats
119
120    def summary(self, format: str = "print") -> None:
121        info = [
122            {
123                "layer": i,
124                "name": filt.name,
125                "doc": filt.__doc__,
126            }
127            for i, filt in enumerate(self.filters)
128        ]
129
130        def to_json(filter_info: dict) -> dict:
131            filter_info["doc"] = "".join(d.strip() for d in filter_info["doc"].split("\n"))
132            return filter_info
133
134        if format == "json":
135            print(json.dumps(list(map(to_json, info)), ensure_ascii=False, indent="\t"))
136        if format == "print":
137            for layer in info:
138                print(f"[{layer['layer']}] {layer['name']}")
139                pprint.pprint(layer["doc"])

Base class for all filters. Document-level filters must inherit from this class.

The definition of filter function is in apply method. If you define a new filter, you must define the method. When this class is called, apply the filter from string to string.

If the filter creates Document.tokens from Document.text, you must implement the tokenize method. If the filter updates Document.text by merging Document.tokens, you must implement the merge method. Do not define a filter that changes both Document.text and Document.token, to prevent unexpected behavior.

If you apply the filter to tokens, you can use TokenFilter class.

Parameters

p: float The probability with which hojichar.Compose applies this filter. skip_rejected: bool If set to True, hojichar.Compose makes this filter ignore documents that have the is_rejected flag set. This flag is True by default since processing discarded documents in subsequent filters is meaningless. However, in some cases, rejected documents need another filter. For example, when analyzing false positives, discarded documents must be passed to JSON dump filters. In such cases, set the skip_rejected flag to False so that the filter receives all documents.

Compose( filters: List[Union[hojichar.core.filter_interface.Filter, hojichar.core.filter_interface.TokenFilter]], random_state: Union[int, numpy.random._generator.Generator, NoneType] = None, *args: Any, **kwargs: Any)
20    def __init__(
21        self,
22        filters: List[Union[Filter, TokenFilter]],
23        random_state: Optional[Union[int, np.random.Generator]] = None,
24        *args: Any,
25        **kwargs: Any,
26    ) -> None:
27        """
28        Compose a filter from pre-defined filter-objects.
29        Filter which has `skip_rejected` flag ignores a document which has `is_rejected` flag.
30        By doing so, Compose avoid applying filters that do not affect the output.
31
32        Parameters
33        ----------
34        filters : List[Union[Filter, TokenFilter]]
35            Filter instances which apply to the corpus.
36
37        random_state : Union[None, int, np.random.Generator], optional
38            Default = None
39            Seed for applying filters randomly.
40            `random_state` must be int or np.random.Generator instance.
41        """
42        super().__init__(*args, **kwargs)
43        self.set_filters(filters)
44        self.logger = logging.getLogger("hojichar.Compose")
45        self.before_process_inspector = Inspector(
46            target_filter=BeforeProcessFilter(), filter_idx=-1
47        )
48        self.inspectors = [
49            Inspector(target_filter=filter, filter_idx=idx)
50            for idx, filter in enumerate(self.filters)
51        ]
52        self._statistics = StatisticsCounter(self.inspectors)
53
54        # Turn random_state into a `np.random.Generator` instance.
55        if random_state is None:
56            self.rng = np.random.default_rng()
57        elif isinstance(random_state, int):
58            self.rng = np.random.default_rng(random_state)
59        elif isinstance(random_state, np.random.Generator):
60            self.rng = random_state
61        else:
62            raise ValueError(f"{random_state} cannot be used to seed.")

Compose a filter from pre-defined filter objects. A filter which has the skip_rejected flag ignores a document which has the is_rejected flag. By doing so, Compose avoids applying filters that do not affect the output.

Parameters

filters : List[Union[Filter, TokenFilter]] Filter instances which apply to the corpus.

random_state : Union[None, int, np.random.Generator], optional Default = None Seed for applying filters randomly. random_state must be int or np.random.Generator instance.

def set_filters( self, filters: List[Union[hojichar.core.filter_interface.Filter, hojichar.core.filter_interface.TokenFilter]]) -> None:
64    def set_filters(self, filters: List[Union[Filter, TokenFilter]]) -> None:
65        """
66        Set the filter to a Compose object. The filter is expanded if the
67        list of filters in the argument contains a filter bound by Compose.
68
69        Args:
70            filters (List[Union[Filter, TokenFilter]]): Target filters
71        """
72        self.filters: List[Union[Filter, TokenFilter]] = []
73        for filter in filters:
74            if isinstance(filter, Compose):
75                self.filters.extend(filter.filters)
76            else:
77                self.filters.append(filter)

Set the filter to a Compose object. The filter is expanded if the list of filters in the argument contains a filter bound by Compose.

Args: filters (List[Union[Filter, TokenFilter]]): Target filters

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
 98    def apply(self, document: Document) -> Document:
 99        document = self.before_process_inspector.apply(document)
100        previous_inspector = self.before_process_inspector
101        for i, filt in enumerate(self.filters):
102            inspector = self.inspectors[i]
103            document = self._apply_filter(filt=filt, document=document)
104            document = inspector.apply(document)
105            if (not previous_inspector.is_rejected) and inspector.is_rejected:
106                document.reject_reason = filt.get_jsonalbe_vars(exclude_keys={"skip_rejected"})
107            previous_inspector = inspector
108
109        self._statistics.update_changes(document, self.before_process_inspector, self.inspectors)
110        return document

Definition of filter behavior.

In this method, the filter will modify document.text, or set document.is_rejected = True to discard the document.

Do not define a filter that changes both document.text and document.token

Parameters

document : Document Input document

Returns

Document Processed Document

def summary(self, format: str = 'print') -> None:
120    def summary(self, format: str = "print") -> None:
121        info = [
122            {
123                "layer": i,
124                "name": filt.name,
125                "doc": filt.__doc__,
126            }
127            for i, filt in enumerate(self.filters)
128        ]
129
130        def to_json(filter_info: dict) -> dict:
131            filter_info["doc"] = "".join(d.strip() for d in filter_info["doc"].split("\n"))
132            return filter_info
133
134        if format == "json":
135            print(json.dumps(list(map(to_json, info)), ensure_ascii=False, indent="\t"))
136        if format == "print":
137            for layer in info:
138                print(f"[{layer['layer']}] {layer['name']}")
139                pprint.pprint(layer["doc"])