import json
import logging as _logging
from collections import defaultdict
from typing import Any, DefaultDict, Dict, List
import numpy as np
import pandas as pd
class LogData:
"""Builds up log data for export as dictionary with dataframes"""
    def __init__(self):
        # data[module][key] -> {'header': header dict, 'values': [...], 'dates': [...]}
        self.data: DefaultDict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
        # (module, key) pairs whose header level passed the level filter
        self.allowed_logs = set()
        # maps each header's uuid to its (module, key) pair so data rows can be routed
        self.uuid_to_module = dict()
def parse_log_line(self, log_line: str, level: int):
"""
Parse LogRow at desired level
:param log_line: a json line from log file that can either be a header or data row
:param level: matching level to add to log, other levels will not be added
"""
        log_data = json.loads(log_line)
        if 'type' in log_data and log_data['type'] == 'header':
            # new header line: remember which (module, key) pair this uuid maps to; if the
            # header's level passes the filter, register the log with its header and empty data
            log_id = (log_data['module'], log_data['key'])
            self.uuid_to_module[log_data['uuid']] = log_id
            if getattr(_logging, log_data['level']) >= level:  # e.g. 'INFO' -> 20
                self.allowed_logs.add(log_id)
                self.data[log_data['module']][log_data['key']] = {'header': log_data, 'values': [], 'dates': []}
        else:
            # data row: recover the (module, key) pair recorded by this row's header
            log_id = module, key = self.uuid_to_module[log_data['uuid']]
            # only record the row if its logger passed the level filter
            if log_id in self.allowed_logs:
                self.data[module][key]['dates'].append(log_data['date'])
                self.data[module][key]['values'].append(log_data['values'])
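
    # Illustrative sketch of the two line shapes parse_log_line expects. The field
    # names match what the parser reads above, but the concrete values (uuid, module,
    # key, columns) are made-up assumptions, not a documented format:
    #
    #   log = LogData()
    #   log.parse_log_line(json.dumps({'type': 'header', 'uuid': 'u1', 'module': 'pricing',
    #                                  'key': 'pv', 'level': 'INFO',
    #                                  'columns': {'pv': 'float'}}), _logging.INFO)
    #   log.parse_log_line(json.dumps({'uuid': 'u1', 'date': '2021-01-04',
    #                                  'values': [101.5]}), _logging.INFO)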
def get_log_dataframes(self) -> DefaultDict[str, Dict[str, pd.DataFrame]]:
"""
Converts parsed logs of dictionaries to dataframes and then returns all logs
:return: dictionary of output logs with dataframes for each log key
"""
output_logs: DefaultDict[str, Dict[str, pd.DataFrame]] = defaultdict(dict)
for module, log_data in self.data.items():
            # keep each log's header under a separate '_metadata' entry
            output_logs['_metadata'][module] = dict()
for key, data in log_data.items():
output_logs['_metadata'][module][key] = data['header']
if list(data['header']['columns'].keys()) == ['dataframe']:
output_logs[module][key] = self.parse_logged_dataframe(data['values'], data['dates'])
else:
output_logs[module][key] = pd.DataFrame(data['values'], columns=data['header']['columns'].keys())
output_logs[module][key].insert(
0, "date", pd.Series(data["dates"], dtype=np.dtype('datetime64[ns]'))
)
# for each column, cast to the correct type if necessary
for n, t in data['header']['columns'].items():
if t == "Timestamp":
output_logs[module][key][n] = output_logs[module][key][n].astype('datetime64[ns]')
elif t == "Categorical":
output_logs[module][key][n] = output_logs[module][key][n].astype('category')
elif t == "set":
output_logs[module][key][n] = output_logs[module][key][n].apply(set)
return output_logs
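
    # Continuing the sketch after parse_log_line above, the returned structure would
    # look roughly like this (illustrative, not actual output):
    #
    #   frames = log.get_log_dataframes()
    #   frames['pricing']['pv']               # DataFrame with columns: date, pv
    #   frames['_metadata']['pricing']['pv']  # the original header dict for that log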
def parse_logged_dataframe(self, values: List[List[Dict[str, Dict[str, Any]]]], dates: List[str]) -> pd.DataFrame:
"""
Converts log data for an entire dataframe being logged into a mutli-indexed dataframe
:param values: logged values
:param dates: list of dates
:return: Multi-indexed (log_row, df_row) dataframe
"""
        # Convert data to {(log_row_i, df_row_i): {df_col_name_1: df_col_val_1, df_col_name_2: df_col_val_2, ...}}
        indexed_data = {(log_row_i, df_row_i): log_row[df_row_i]
                        # each log row becomes the first part of the multi-index
                        for log_row_i in range(len(values))
                        # each log row is a list holding a single dictionary
                        for log_row in values[log_row_i]
                        # the index for each row of the logged dataframe
                        for df_row_i in log_row.keys()}
# create dataframe from indexed data, and join dates based on the log row index
log_date = pd.DataFrame(pd.Series(dates, name='date', dtype=np.dtype('datetime64[ns]')))
log_date.index.set_names("log_row", inplace=True)
logged_df = pd.DataFrame.from_dict(indexed_data, orient='index')
logged_df.index.set_names(['log_row', 'df_row'], inplace=True)
return log_date.join(logged_df)
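

if __name__ == '__main__':
    # Minimal end-to-end smoke test. Illustrative only: the log line shapes are
    # assumptions inferred from the fields the parser reads, not a documented format.
    log = LogData()
    # a scalar log with a single float column
    log.parse_log_line(json.dumps({'type': 'header', 'uuid': 'u1', 'module': 'pricing',
                                   'key': 'pv', 'level': 'INFO',
                                   'columns': {'pv': 'float'}}), _logging.INFO)
    log.parse_log_line(json.dumps({'uuid': 'u1', 'date': '2021-01-04',
                                   'values': [101.5]}), _logging.INFO)
    # a whole-dataframe log, exercising parse_logged_dataframe
    log.parse_log_line(json.dumps({'type': 'header', 'uuid': 'u2', 'module': 'pricing',
                                   'key': 'curve', 'level': 'INFO',
                                   'columns': {'dataframe': 'DataFrame'}}), _logging.INFO)
    log.parse_log_line(json.dumps({'uuid': 'u2', 'date': '2021-01-04',
                                   'values': [{'0': {'tenor': '1Y', 'rate': 0.01}}]}), _logging.INFO)
    frames = log.get_log_dataframes()
    print(frames['pricing']['pv'])
    print(frames['pricing']['curve'])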