Source code for posydon.visualization.VH_diagram.ParseDataFrame

"""Counting distincts occurence of binary simulation from a file"""

__authors__ = [
    "Wène Kouarfate <Wene.Kouarfate@etu.unige.ch>"
]

import numpy as np
import pandas as pd
import os
from collections import Counter
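
# Core idea: each binary's evolution is reduced to the flattened sequence of
# its (state, event, S1_state, S2_state) rows, and that sequence is hashed:
#
#     h = hash(tuple(sub_df.to_numpy().ravel()))
#
# Binaries with identical sequences collide on the same key, so a Counter
# over these hashes counts how many binaries follow each distinct evolution.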

class ParseDataFrame:
    """Handle the binary parsing."""

    def __init__(self, filename, path="./", key='history',
                 column_list=['state', 'event', 'S1_state', 'S2_state'],
                 index_name='binary_index', start=None, stop=None,
                 chunk_size=500000):
        self.path = path
        self.filename = filename
        self.key = key
        self.column_list = column_list
        self.index_name = index_name
        self.chunk_size = chunk_size
        self.start = start
        self.stop = stop
        self.index_list = dict()
        self.counts = Counter()
        self._parse_dataf_groupby_col3_chunk()
        self.count_dict = Counter({self.index_list[k]: self.counts[k]
                                   for k in self.counts.keys()})

    def _f_lambda(self, df_gb):
        """Function to be given as key argument to DataFrameGroupBy.apply()."""
        # .to_numpy() is recommended by the pandas docs instead of .values.
        h = hash(tuple(df_gb.to_numpy().ravel()))
        self.counts[h] += 1
        # Remember the first binary seen with this evolution as its representative.
        self.index_list.setdefault(h, df_gb.index[0])
        return None

    def _parse_dataf_groupby_col3_chunk(self):
        file_path = os.path.join(self.path, self.filename)
        rdf = pd.DataFrame(columns=self.column_list,
                           index=pd.Index([], name=self.index_name))
        for dataf in pd.read_hdf(file_path, self.key, columns=self.column_list,
                                 start=self.start, stop=self.stop,
                                 chunksize=self.chunk_size):
            # Prepend the rows carried over from the previous chunk, then hold
            # back the last binary of this chunk: it may continue in the next.
            dataf = pd.concat([rdf, dataf])
            rdf = dataf.loc[[dataf.index[-1]]]
            dataf = dataf.drop(dataf.index[-1])
            gb_df_col = dataf.groupby(by=dataf.index.name)
            gb_df_col.apply(self._f_lambda)
        # Count the last carried-over binary.
        self._f_lambda(rdf)

    def get_frequencies(self):
        """Return the percentage of binaries following each distinct evolution."""
        total = sum(self.counts.values())
        return Counter({self.index_list[k]: 100 * self.counts[k] / total
                        for k in self.counts.keys()})

    def get_most_numpy(self, k):
        """Return the k most common (index, count) pairs as a numpy array."""
        # One can then access columns for VHDiagramm_m.
        return np.array(self.count_dict.most_common(k))

    def parse_dataf_gb_iter_chunk(self):
        """Iterate over the groups explicitly.

        Arguably a more natural parser, but it turns out to take more time
        than groupby/apply.
        """
        file_path = os.path.join(self.path, self.filename)
        rdf = pd.DataFrame(columns=self.column_list,
                           index=pd.Index([], name=self.index_name))
        for dataf in pd.read_hdf(file_path, self.key, columns=self.column_list,
                                 start=self.start, stop=self.stop,
                                 chunksize=self.chunk_size):
            dataf = pd.concat([rdf, dataf])
            rdf = dataf.loc[[dataf.index[-1]]]
            dataf = dataf.drop(dataf.index[-1])
            gb_df_col = dataf.groupby(dataf.index.name)
            for i, s in gb_df_col:
                h = hash(tuple(s.to_numpy().ravel()))
                self.counts[h] += 1
                self.index_list.setdefault(h, s.index[0])
        # Count the last carried-over binary, as in the groupby/apply version.
        self._f_lambda(rdf)
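

# A minimal usage sketch under assumed inputs: the file name "population.h5",
# the "./data/" path, and the printed quantities are hypothetical examples,
# not part of the POSYDON API.
if __name__ == "__main__":
    parser = ParseDataFrame("population.h5", path="./data/")  # hypothetical file
    # Percentage of binaries per distinct evolution, keyed by a representative
    # binary_index; show the three most common channels.
    print(parser.get_frequencies().most_common(3))
    # The 5 most common evolutions as an (index, count) numpy array.
    print(parser.get_most_numpy(5))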