Source code for kedro_partitioned.pipeline.decorators.helper_factory

"""Function factories for creating filters."""
import re
from re import Pattern

import pandas as pd
from kedro_partitioned.utils.typing import IsFunction

DATE_FORMAT_ISO = '%Y-%m-%d'


[docs]def not_filter(fn: IsFunction[str]) -> IsFunction[str]: """Inverts a function return. Args: fn (IsFunction[str]) Returns: IsFunction[str] Example: >>> def fn(x: str) -> bool: return x == 'a' >>> nfn = not_filter(fn) >>> fn('a') True >>> fn('b') False >>> nfn('a') False >>> nfn('b') True """ def filter(path: str) -> bool: return not fn(path) return filter
[docs]def regex_filter(pattern: str) -> IsFunction[str]: """Converts a regex pattern into a boolean filter. Args: pattern (str) Returns: IsFunction[str]: A function that takes a string as input and checks if it matches a pattern, if matches return True, otherwise False. Example: >>> fn = regex_filter(r'ab?') >>> fn('ad') True >>> fn('ab') True >>> fn('db') False """ regex = re.compile(pattern) def filter_fn(x: str) -> bool: return bool(regex.search(x)) return filter_fn
def _date_format_to_regex(format: str) -> Pattern: r"""Converts a date format pattern into a regex. Args: format (str) Returns: Pattern Example: >>> _date_format_to_regex('%Y-%m-%d') re.compile('\\d{4}-((0[0-9])|(1[0-2]))-(([0-2][0-9])|(3[0-1]))') >>> _date_format_to_regex('%d/%m/%Y') re.compile('(([0-2][0-9])|(3[0-1]))\\/((0[0-9])|(1[0-2]))\\/\\d{4}') """ return re.compile( format.replace('/', r'\/').replace('%Y', r'\d{4}').replace( '%m', r'((0[0-9])|(1[0-2]))').replace('%d', r'(([0-2][0-9])|(3[0-1]))'))
[docs]def date_range_filter(min_date: str = pd.Timestamp.min, max_date: str = pd.Timestamp.max, format: str = DATE_FORMAT_ISO) -> IsFunction[str]: """Generates a date_range filter function. Args: min_date (str, optional): Defaults to pd.Timestamp.min. max_date (str, optional): Defaults to pd.Timestamp.max. format (str, optional): Defaults to DATE_FORMAT_ISO. Returns: IsFunction[str]: A function that takes a string as input, extracts a regex match of the specified format and returns True whether a string is in date range (both inclusive), otherwise False. Example: >>> upper = date_range_filter(max_date='2020-02-02', format='%Y-%m-%d') >>> upper('2020-02-03') False >>> upper('2020-02-02') True >>> upper('random/string/2020-01-01/suffix') True >>> lower = date_range_filter(min_date='20200202', format='%Y%m%d') >>> lower('20200201') False >>> lower('20200202') True >>> lower('random/string/20200301/suffix') True >>> bounded = date_range_filter(min_date='02-02/2020', ... max_date='02-04/2020', format='%d-%m/%Y') >>> bounded('prefix/02-04/2020/suffix/') True >>> bounded('prefix-02-02/2020-suffix') True >>> bounded('prefix/24-03/2020-suffix') True >>> bounded('prefix-13-01/2020/suffix') False >>> bounded('prefix25-05/2020suffix') False Warning: the specified format format must not be ambiguous. for example, if a string format is '%Y%m%d', and another number of 8 digits appear, it will be recognized as date. if it is a path for example, this can be avoided by using '/%Y%m%d/' as the format pattern """ pd_min = pd.to_datetime(min_date, format=format) pd_max = pd.to_datetime(max_date, format=format) regex = _date_format_to_regex(format) def filter(path: str) -> bool: match = re.search(regex, path) if match: str_date = match[0] pd_date = pd.to_datetime(str_date, format=format) return pd_date >= pd_min and pd_date <= pd_max return False return filter