hojichar.core.filter_interface

  1import logging
  2from typing import Any, Dict, Optional, Set
  3
  4from hojichar.core.models import Document, Token
  5
  6
  7def _is_jsonable(data: Any) -> bool:
  8    if data is None:
  9        return True
 10    elif isinstance(data, (bool, int, float, str)):
 11        return True
 12    """
 13    elif isinstance(data, (tuple, list)):
 14        return all(Filter._is_jsonable(x) for x in data)
 15    elif isinstance(data, dict):
 16        return all(isinstance(k, str) and Filter._is_jsonable(v) for k, v in data.items())
 17    """
 18    return False
 19
 20
 21class Filter:
 22    """
 23    Base class for all filters.
 24    Document-level filters must inherit from this class.
 25
 26    The definition of filter function is in `apply` method.
 27    If you define a new filter, you must define the method.
 28    When this class is called, apply the filter from string to string.
 29
 30    If the filter create `Document.tokens` form `Document.text`, you
 31    must implement `tokenize` method.
 32    If the filter update `Document.text` by merging `Document.tokens`, you
 33    must implement `merge` method.
 34    Do not define a filter that changes both `Document.text` and `Document.token`
 35    to prevent unexpected behavior.
 36
 37    If you apply the filter to tokens, you can use `TokenFilter` class.
 38
 39    Parameters
 40    ----------
 41    p: float
 42        The probability apply the filter organized by hojichar.Compose
 43    skip_reject: bool
 44        If set `True`, `hojichar.Compose` make this filter ignore the document
 45        which has `is_rejected` flag.
 46        This flag is `True` by default since processing discarded documents
 47        in subsequent filters is meaningless. However, in some cases, docs that
 48        have been rejected need another filter. For example, analyzing false-positive,
 49        discarded docs must be passed to JSON Dump filters. In such case,
 50        set the `skip_reject` flag as `False` and make it pass all docs.
 51    """
 52
 53    def __init__(
 54        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
 55    ) -> None:
 56        """
 57        Parameters
 58        ----------
 59        p : float, optional
 60            Probability that this filter will be applied. Default=1
 61        """
 62        self.name = self.__class__.__name__
 63        self.logger = logging.getLogger("hojichar.document_filters." + self.name)
 64        assert 0 <= p <= 1
 65        self.p = p
 66        self.skip_rejected = skip_rejected
 67
 68    def apply(self, document: Document) -> Document:
 69        """Definition of filter behavior.
 70
 71        In this method, the filter will modify `document.text`, or
 72        set `document.is_rejected = True` to discard the document.
 73
 74        Do not define a filter that changes both `document.text` and `document.token`
 75
 76        Parameters
 77        ----------
 78        document : Document
 79            Input document
 80
 81        Returns
 82        -------
 83        Document
 84            Processed Document
 85        """
 86        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")
 87        return document
 88
 89    def apply_filter(self, document: Document) -> Document:
 90        document = self.apply(document)
 91        return document
 92
 93    def __call__(self, text: str) -> str:
 94        document = Document(text)
 95        document = self.apply(document)
 96        return document.text
 97
 98    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
 99        """
100        Get the member variable of this filter.
101        Eligible variables are primitive types; [bool, int, float, str, None],
102        and the name of the variable not starts with the underscore; `_`.
103        """
104        if exclude_keys is None:
105            exclude_keys = set()
106        return {
107            k: v
108            for k, v in vars(self).items()
109            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
110        }
111
112
class TokenFilter:
    """
    Base class for token-level filters.

    Token filters, which should be implemented in hojichar/filters/token_filters.py,
    must inherit from this class.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Stored as `self.p`. Asserted to satisfy `0 <= p <= 1`. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted but not used by this base class.
        """
        # The logger is named after the concrete subclass, not the base class.
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.token_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, token: Token) -> Token:
        """Definition of the token-level filter behavior.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """Apply the filter to every non-rejected token of `document`.

        `document.tokens` is replaced by the list of `apply` results; tokens
        whose `is_rejected` flag is already set are left out of the new list.
        The same `document` object is returned.
        """
        document.tokens = [self.apply(token) for token in document.tokens if not token.is_rejected]
        return document

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Token`, run `apply`, and return the resulting text."""
        return self.apply(Token(text)).text

    def get_jsonable_vars(self) -> dict:
        """Return all member variables from `vars(self)` except `logger`.

        Unlike `get_jsonalbe_vars`, values are not checked for being
        JSON-representable and underscore-prefixed names are kept.
        """
        return {key: value for key, value in vars(self).items() if key != "logger"}

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
        """
        Get the member variables of this filter.
        Eligible variables are those whose values are primitive types
        (bool, int, float, str, None), whose names are not in `exclude_keys`,
        and whose names do not start with an underscore `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Attribute names to leave out. `None` means nothing is excluded.
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }
class Filter:
 22class Filter:
 23    """
 24    Base class for all filters.
 25    Document-level filters must inherit from this class.
 26
 27    The definition of filter function is in `apply` method.
 28    If you define a new filter, you must define the method.
 29    When this class is called, apply the filter from string to string.
 30
 31    If the filter create `Document.tokens` form `Document.text`, you
 32    must implement `tokenize` method.
 33    If the filter update `Document.text` by merging `Document.tokens`, you
 34    must implement `merge` method.
 35    Do not define a filter that changes both `Document.text` and `Document.token`
 36    to prevent unexpected behavior.
 37
 38    If you apply the filter to tokens, you can use `TokenFilter` class.
 39
 40    Parameters
 41    ----------
 42    p: float
 43        The probability apply the filter organized by hojichar.Compose
 44    skip_reject: bool
 45        If set `True`, `hojichar.Compose` make this filter ignore the document
 46        which has `is_rejected` flag.
 47        This flag is `True` by default since processing discarded documents
 48        in subsequent filters is meaningless. However, in some cases, docs that
 49        have been rejected need another filter. For example, analyzing false-positive,
 50        discarded docs must be passed to JSON Dump filters. In such case,
 51        set the `skip_reject` flag as `False` and make it pass all docs.
 52    """
 53
 54    def __init__(
 55        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
 56    ) -> None:
 57        """
 58        Parameters
 59        ----------
 60        p : float, optional
 61            Probability that this filter will be applied. Default=1
 62        """
 63        self.name = self.__class__.__name__
 64        self.logger = logging.getLogger("hojichar.document_filters." + self.name)
 65        assert 0 <= p <= 1
 66        self.p = p
 67        self.skip_rejected = skip_rejected
 68
 69    def apply(self, document: Document) -> Document:
 70        """Definition of filter behavior.
 71
 72        In this method, the filter will modify `document.text`, or
 73        set `document.is_rejected = True` to discard the document.
 74
 75        Do not define a filter that changes both `document.text` and `document.token`
 76
 77        Parameters
 78        ----------
 79        document : Document
 80            Input document
 81
 82        Returns
 83        -------
 84        Document
 85            Processed Document
 86        """
 87        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")
 88        return document
 89
 90    def apply_filter(self, document: Document) -> Document:
 91        document = self.apply(document)
 92        return document
 93
 94    def __call__(self, text: str) -> str:
 95        document = Document(text)
 96        document = self.apply(document)
 97        return document.text
 98
 99    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
100        """
101        Get the member variable of this filter.
102        Eligible variables are primitive types; [bool, int, float, str, None],
103        and the name of the variable not starts with the underscore; `_`.
104        """
105        if exclude_keys is None:
106            exclude_keys = set()
107        return {
108            k: v
109            for k, v in vars(self).items()
110            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
111        }

Base class for all filters. Document-level filters must inherit from this class.

The definition of filter function is in apply method. If you define a new filter, you must define the method. When this class is called, apply the filter from string to string.

If the filter creates Document.tokens from Document.text, you must implement the tokenize method. If the filter updates Document.text by merging Document.tokens, you must implement the merge method. Do not define a filter that changes both Document.text and Document.tokens, to prevent unexpected behavior.

If you apply the filter to tokens, you can use TokenFilter class.

Parameters

p: float — The probability with which hojichar.Compose applies the filter. skip_rejected: bool — If set to True, hojichar.Compose makes this filter ignore documents that have the is_rejected flag. This flag is True by default since processing discarded documents in subsequent filters is meaningless. However, in some cases, docs that have been rejected need another filter. For example, when analyzing false positives, discarded docs must be passed to JSON Dump filters. In such a case, set skip_rejected to False to make it pass all docs.

Filter(p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any)
54    def __init__(
55        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
56    ) -> None:
57        """
58        Parameters
59        ----------
60        p : float, optional
61            Probability that this filter will be applied. Default=1
62        """
63        self.name = self.__class__.__name__
64        self.logger = logging.getLogger("hojichar.document_filters." + self.name)
65        assert 0 <= p <= 1
66        self.p = p
67        self.skip_rejected = skip_rejected

Parameters

p : float, optional Probability that this filter will be applied. Default=1

def apply( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
69    def apply(self, document: Document) -> Document:
70        """Definition of filter behavior.
71
72        In this method, the filter will modify `document.text`, or
73        set `document.is_rejected = True` to discard the document.
74
75        Do not define a filter that changes both `document.text` and `document.token`
76
77        Parameters
78        ----------
79        document : Document
80            Input document
81
82        Returns
83        -------
84        Document
85            Processed Document
86        """
87        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")
88        return document

Definition of filter behavior.

In this method, the filter will modify document.text, or set document.is_rejected = True to discard the document.

Do not define a filter that changes both document.text and document.token

Parameters

document : Document Input document

Returns

Document Processed Document

def apply_filter( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
90    def apply_filter(self, document: Document) -> Document:
91        document = self.apply(document)
92        return document
def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
 99    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> Dict[str, Any]:
100        """
101        Get the member variable of this filter.
102        Eligible variables are primitive types; [bool, int, float, str, None],
103        and the name of the variable not starts with the underscore; `_`.
104        """
105        if exclude_keys is None:
106            exclude_keys = set()
107        return {
108            k: v
109            for k, v in vars(self).items()
110            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
111        }

Get the member variables of this filter. Eligible variables are those whose values are primitive types (bool, int, float, str, or None) and whose names do not start with an underscore (_).

class TokenFilter:
class TokenFilter:
    """
    Base class for token-level filters.

    Token filters, which should be implemented in hojichar/filters/token_filters.py,
    must inherit from this class.
    """

    def __init__(
        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
    ) -> None:
        """
        Parameters
        ----------
        p : float, optional
            Stored as `self.p`. Asserted to satisfy `0 <= p <= 1`. Default=1
        skip_rejected : bool, optional
            Stored as `self.skip_rejected`. Default=True
        *args, **kwargs
            Accepted but not used by this base class.
        """
        # The logger is named after the concrete subclass, not the base class.
        self.name = self.__class__.__name__
        self.logger = logging.getLogger("hojichar.token_filters." + self.name)
        assert 0 <= p <= 1
        self.p = p
        self.skip_rejected = skip_rejected

    def apply(self, token: Token) -> Token:
        """Definition of the token-level filter behavior.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override the method.
        """
        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")

    def apply_filter(self, document: Document) -> Document:
        """Apply the filter to every non-rejected token of `document`.

        `document.tokens` is replaced by the list of `apply` results; tokens
        whose `is_rejected` flag is already set are left out of the new list.
        The same `document` object is returned.
        """
        document.tokens = [self.apply(token) for token in document.tokens if not token.is_rejected]
        return document

    def __call__(self, text: str) -> str:
        """Wrap `text` in a `Token`, run `apply`, and return the resulting text."""
        return self.apply(Token(text)).text

    def get_jsonable_vars(self) -> dict:
        """Return all member variables from `vars(self)` except `logger`.

        Unlike `get_jsonalbe_vars`, values are not checked for being
        JSON-representable and underscore-prefixed names are kept.
        """
        return {key: value for key, value in vars(self).items() if key != "logger"}

    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
        """
        Get the member variables of this filter.
        Eligible variables are those whose values are primitive types
        (bool, int, float, str, None), whose names are not in `exclude_keys`,
        and whose names do not start with an underscore `_`.

        Parameters
        ----------
        exclude_keys : Optional[Set[str]]
            Attribute names to leave out. `None` means nothing is excluded.
        """
        if exclude_keys is None:
            exclude_keys = set()
        return {
            k: v
            for k, v in vars(self).items()
            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
        }

Base class for token-level filters.

Token filters, which should be implemented in hojichar/filters/token_filters.py, must inherit from this class.

TokenFilter(p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any)
122    def __init__(
123        self, p: float = 1, skip_rejected: bool = True, *args: Any, **kwargs: Any
124    ) -> None:
125        self.name = self.__class__.__name__
126        self.logger = logging.getLogger("hojichar.token_filters." + self.name)
127        assert 0 <= p <= 1
128        self.p = p
129        self.skip_rejected = skip_rejected
def apply(self, token: hojichar.core.models.Token) -> hojichar.core.models.Token:
131    def apply(self, token: Token) -> Token:
132        raise NotImplementedError(f"{self.__class__.__name__}.apply method is not defined")
133        return token
def apply_filter( self, document: hojichar.core.models.Document) -> hojichar.core.models.Document:
135    def apply_filter(self, document: Document) -> Document:
136        document.tokens = [self.apply(token) for token in document.tokens if not token.is_rejected]
137        return document
def get_jsonable_vars(self) -> dict:
144    def get_jsonable_vars(self) -> dict:
145        # Output key-values of member variables that can be obtained by var(self), except "logger".
146        exclude_keys = ["logger"]
147        return dict(filter(lambda item: item[0] not in exclude_keys, vars(self).items()))
def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
149    def get_jsonalbe_vars(self, exclude_keys: Optional[Set[str]] = None) -> dict:
150        """
151        Get the member variable of this filter.
152        Eligible variables are primitive types; [bool, int, float, str, None],
153        and the name of the variable not starts with the underscore; `_`.
154        """
155        if exclude_keys is None:
156            exclude_keys = set()
157        return {
158            k: v
159            for k, v in vars(self).items()
160            if (_is_jsonable(v) and (k not in exclude_keys) and (not k.startswith("_")))
161        }

Get the member variables of this filter. Eligible variables are those whose values are primitive types (bool, int, float, str, or None) and whose names do not start with an underscore (_).