Source code for bento.util

import pandas as pd
import numpy as np
import math
import plotly.express as px

from collections import defaultdict
from bento.common import logger, logutil, dictutil  # noqa

logging = logger.fancy_logger(__name__)


def desnake(text):
    """Turns underscores into spaces"""
    return str(text).strip().replace("_", " ")
def titlize(text):
    return desnake(text.title())


def get_unit(num):
    """Returns a (significand, SI prefix) pair, e.g. 5160 -> (5.16, "K")"""
    scale = ["f", "n", "u", "m", "", "K", "M", "G", "T", "P"]
    offset = 4
    try:
        exp = min(9, offset + math.log10(num) / 3)
        if exp < 0:
            return num, ""
        exp = int(exp)
        sig = round(num / 10 ** ((exp - 4) * 3), 2)
        return sig, scale[exp]
    except Exception:
        return num, ""


# @logutil.loginfo(level="debug")
def prepare_transforms(inputs, dep_var="y"):
    """Builds (column, method chain, args) tuples consumed by trace_analytics"""
    transforms = []
    dep_column = dictutil.extract_unique(f"{dep_var}_column", inputs, pop=False)
    for key, value in dictutil.extract("_transform", inputs, pop=False).items():
        if not value:
            continue
        if "window" in key:
            transforms.append((dep_column, ["rolling", "mean"], [(value,), ()]))
        elif "calc" in key:
            if value == "Rate":
                transforms.append((dep_column, ["diff"], [()]))
            elif value == "Acceleration":
                transforms.append((dep_column, ["diff", "diff"], [(), ()]))
            elif value == "Cumulative":
                transforms.append((dep_column, ["cumsum"], [()]))
        elif "norm" in key:
            if value == "Max":
                transforms.append((dep_column, ["div"], [("trace.max",)]))
            elif value == "Other Series":
                transforms.append((dep_column, ["div"], [("ref.0",)]))
    return transforms


# @logutil.loginfo(level="debug")
def prepare_filters(inputs):
    """Groups *_filter inputs into a {logic: {column: [values]}} structure"""
    filters = defaultdict(lambda: defaultdict(list))
    logic = dictutil.extract_unique("filter_logic", inputs, default="or").lower()
    for key, values in dictutil.extract("_filter", inputs, pop=False).items():
        col = key.replace("_filter", "")
        if not values:
            continue
        if not isinstance(values, list):
            values = [values]
        if "date" in key and len(values) == 2:
            filters["between"][col].extend(values)
        else:
            filters[logic][col].extend(values)
    return filters


# NOTE Currently used for pie charts and ranking
# @logutil.loginfo(level='debug')
def filter_df(idf, filters):
    odf = idf
    for logic, columns in filters.items():
        for column, values in columns.items():
            if "datetime" in str(type(values[0])):
                values = [np.datetime64(item) for item in values]
            if logic == "between":
                # TODO generalize filters to handle types
                try:
                    odf = odf[(odf[column] >= values[0]) & (odf[column] <= values[1])]
                except TypeError:
                    odf = odf[
                        (odf[column].astype(int) >= values[0])
                        & (odf[column].astype(int) <= values[1])  # noqa
                    ]
            elif logic == "or":
                odf = odf[odf[column].isin(values)]
            elif logic == "and":
                odf = odf[odf[column].isin(values)]
    return odf


# @logutil.loginfo(level="debug")
# TODO Figure out a way around this hack, which manually filters out None strings as
# a substitute for properly dealing with bipartite dataframes
def rank(idf, key, text_key, column, count=10, **kwargs):
    fdf = idf.groupby(key).sum().reset_index()
    fdf = fdf[fdf[text_key] != "None"]
    fdf = fdf.nlargest(count, column)
    return zip(fdf[text_key], fdf[column])


def apply_defaults(component_type, raw_inputs, data):
    inputs = {"variant": component_type.split(".")[-1]}
    inputs.update(raw_inputs)
    if not component_type.startswith("graph.normal"):
        logging.debug("No defaults implemented for non-normal graph components")
        return inputs
    if "x_column" not in inputs:
        possible = [col for col in data["df"].columns if "date" in col]
        inputs["x_column"] = possible[0]
        inputs["x_scale"] = "date"
        logging.info(f"Graph x_column missing, setting to '{possible[0]}'")
        logging.info(" ...also setting x_scale to 'date'")
    if "y_column" not in inputs:
        possible = [
            key
            for key, val in data["types"].items()
            if "date" not in key and "date" != val
        ]
        inputs["y_column"] = possible[0]
        logging.info(f"Graph y_column missing, setting to '{possible[0]}'")
    return inputs


# NOTE Used for preparing the traces for graphs
# TODO Should combine this with filter_df
# @logutil.loginfo(level="debug")
def prepare_traces(idf, filters, key_columns):
    # NOTE Brought over from figure callback, default multi-column approach
    # TODO Figure out how to determine default columns from df
    # column = self.data.get("keys", self.data["columns"][0])[0]
    # def_x_column = self.data["columns"][1]
    # idf.groupby(column).max().reset_index().nlargest(8, def_x_column)[column]
    idf["label"] = ""
    idf.name = ""
    traces = [idf]
    for logic, columns in filters.items():
        new_traces = []
        if logic == "between":
            for column, values in columns.items():
                for df in traces:
                    # TODO generalize filters to handle types
                    try:
                        new = df[(df[column] >= values[0]) & (df[column] <= values[1])]
                    except TypeError:
                        new = df[
                            (df[column].astype(int) >= values[0])
                            & (df[column].astype(int) <= values[1])  # noqa
                        ]
                    new.name = df.name
                    new_traces.append(new)
        elif logic == "or":
            for column, values in columns.items():
                for df in traces:
                    for value in values:
                        new = df[df[column] == value]
                        new.name = df.name
                        try:
                            new.name += " " + value
                        except Exception:
                            # try:
                            new.name += " " + pd.to_datetime(value).strftime("%Y-%m-%d")
                            # except Exception:
                            #     logging.warning(f"Can't add {column} to trace name")
                        new_traces.append(new)
        elif logic == "and":
            for column, values in columns.items():
                for df in traces:
                    for value in values:
                        new = df[df[column] == value]
                        new.name = df.name
                        try:
                            new.name += " " + value
                        except Exception:
                            # try:
                            new.name += " " + pd.to_datetime(value).strftime("%Y-%m-%d")
                            # except Exception:
                            #     logging.warning(f"Can't add {column} to trace name")
                        new_traces.append(new)
        traces = new_traces
    new_traces = []
    if key_columns:
        for df in traces:
            new = df.groupby(key_columns).sum().reset_index()
            new.name = df.name
            new_traces.append(new)
    else:
        new_traces = traces
    traces = new_traces
    return traces


# @logutil.loginfo(level="debug")
def trace_analytics(traces, transforms):
    """Applies each transform's method chain to every trace's dependent column"""
    for transform in transforms:
        column, operations, arg_list = transform
        for trace in traces:
            buff = trace[column]
            for op, args in zip(operations, arg_list):
                final_args = []
                for arg in args:
                    if "trace" in str(arg):
                        oper = arg.split(".")[-1]
                        final_args.append(getattr(buff, oper)())
                    elif "ref" in str(arg):
                        # NOTE Only the first trace is currently used as a reference
                        idx = 0
                        final_args.append(traces[idx][column])
                    else:
                        final_args.append(arg)
                buff = getattr(buff, op)(*final_args)
            trace[column] = buff
    return traces


# @logutil.loginfo(level="debug")
def aggregate(idf, y_column=None, filters=None, logic="sum", keys=None, **kwargs):
    filters = filters or {}
    filters.update(kwargs.get("fixed_filters", {}))
    # TODO Plenty of work to do cleaning up the data processing utilities like this
    keys = keys or ["date"]
    traces = prepare_traces(idf, filters, keys)
    agg_df = pd.concat(traces)
    # NOTE Pay attention to this block for multi-axis support
    if not y_column:
        return len(agg_df), ""
    elif isinstance(y_column, list):
        y_column = y_column[0]
    quantity = getattr(agg_df[y_column], logic)()
    return get_unit(quantity)


def _date_marks(ordered):
    spacing = 7
    style = {}
    labels = {item: pd.Timestamp(item).date().day for item in ordered[::spacing]}
    marks = {
        int(key): {"label": label, "style": style} for key, label in labels.items()
    }
    return marks


def gen_marks(series, variant="auto"):
    """Processes a dataframe column into a valid slider series"""
    ordered = sorted(series)
    spacing = math.ceil(len(ordered) / 10)
    if variant == "date":
        marks = _date_marks(ordered)
    else:
        marks = {item: str(item) for item in series[::spacing]}
    return marks


def gen_options(option_input, multi=False, default=None):
    # In this case, we're given just the set of options, assuming the first is the default
    if isinstance(option_input, list):
        option_list = option_input
        base_default = [] if multi else option_list[0]
        default = default if default is not None else base_default
    # The default may be specified in the dict version
    elif isinstance(option_input, dict):
        # TODO Make this more robust
        if "value" in option_input:
            return option_input
        option_list = option_input["options"]
        base_default = [] if multi else option_list[0]
        default = (
            default
            if default is not None
            else option_input.get("default", base_default)
        )
    else:
        logging.warning(f"Unsupported type {type(option_input)} for options")
        logging.debug(option_input)
        # Fall back to an empty option list rather than failing below
        option_list = []
    # TODO Can we determine when we should run desnake on the entries?
    options = [{"label": desnake(item).title(), "value": item} for item in option_list]
    return {"options": options, "value": default}


def get_first_numeric(data_types):
    for dtype in data_types:
        if "log" in data_types[dtype]:
            return dtype


def log_color_scale(name, base=2.718, category="sequential"):
    color_category = getattr(px.colors, category)
    color_sequence = getattr(color_category, name)
    # print(color_sequence)
    log_val = [round(1 / base ** idx, 10) for idx in range(len(color_sequence))][::-1]
    log_val[0] = 0
    log_sequence = list(zip(log_val, color_sequence))
    return log_sequence


# @logutil.loginfo(level='debug')
def data_range(pd_df_list, column, scale="category"):
    minimum = pd_df_list[0][column].min()
    maximum = pd_df_list[0][column].max()
    for df in pd_df_list:
        series = df[column]
        minimum = min(minimum, series.min())
        maximum = max(maximum, series.max())
    if scale == "log":
        minimum = max(0.1, minimum)
        minimum = np.floor(max(-1, np.log10(minimum)))
        maximum = np.ceil(np.log10(maximum))
    elif scale == "linear":
        minimum = np.floor(minimum)
        maximum = np.ceil(maximum)
    return [minimum, maximum]


if __name__ == "__main__":
    log_color_scale("Viridis", base=10)
    for i in range(-18, 19):
        unit = "Hz"
        sig, scale = get_unit(10 ** i)
        print(f"{sig} {scale}{unit}")
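
As a quick illustration of how these helpers fit together, the sketch below builds a filter spec from widget-style inputs, applies it to a dataframe, and formats the aggregate with an SI prefix. It assumes dictutil.extract matches keys by substring (as the "_filter" and "_transform" lookups above suggest); the sample dataframe and input values are hypothetical.

import pandas as pd
from bento.util import prepare_filters, filter_df, get_unit

# Hypothetical sample data
df = pd.DataFrame({
    "country": ["US", "US", "CA", "MX"],
    "cases": [1200, 3400, 560, 78],
})
# Keys ending in "_filter" are picked up by prepare_filters
inputs = {"country_filter": ["US", "CA"], "filter_logic": "or"}
filters = prepare_filters(inputs)   # roughly {"or": {"country": ["US", "CA"]}}
subset = filter_df(df, filters)     # rows where country is US or CA
sig, prefix = get_unit(subset["cases"].sum())
print(f"{sig}{prefix} cases")       # 5.16K cases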