Source code for posydon.popsyn.analysis

"""Module for analyzing binary population simulation results."""


__authors__ = [
    "Konstantinos Kovlakas <Konstantinos.Kovlakas@unige.ch>",
]


import pandas as pd


CHUNKSIZE = 100000
CHUNKSIZE_SMALL = 1000


[docs] def report_columns(path): """Print the columns in the `oneline` and `history` keys in an HDF5 file. Parameters ---------- path : str The fullpath of the HDF5 file. """ print("Population file:", path) for tablename in ["oneline", "history"]: for chunk in pd.read_hdf(path, tablename, chunksize=1): print("\nColumns in `{}`\n{}\n{}".format( tablename, "-"*20, " | ".join(chunk.columns))) break
def _decide_chunksize(n_binaries): """Decide the HDF5 chunk size when searching for n_binaries.""" if n_binaries is None: return CHUNKSIZE return min(CHUNKSIZE, max(CHUNKSIZE_SMALL, n_binaries // 10)) def _filter_columns(columns, patterns): """Select a subset of columns based on a substring or array of substrings. Parameters ---------- columns : array of str The original columns to be filtered. patterns : str or array of str The pattern or list of patterns, where pattern is a substring. Returns ------- array of str Subset of the columns on the basis of the inclusion of substring(s). """ if patterns is None: # return input if undefined pattern return list(columns) if isinstance(patterns, (str, bytes)): # in the case of one pattern patterns = [patterns] selected_cols = [] for col in list(columns): include_it = False for pattern in patterns: if pattern in col: include_it = True break if include_it: selected_cols.append(col) return selected_cols
[docs] def get_subsets(path, find_n=None, select_oneline=None, select_history=None, select_columns=None, verbose=True): """Get the `oneline` and `history` subsets for a selection of binaries. You select binaries from an HDF5 population by providing a function that operates either on `oneline` or `history` and returns a boolean mask in- dicating which rows correspond to the binaries of interest. E.g., def find_no_end(df): return df["event_f"] != "END" can be used to find binaries that ended abruptly, based on the information in the `oneline` table. Therefore, we set `select_oneline=find_no_end`. If no selector has been set, all binaries are selected (combine this with the `select_columns` argument to get a column subset of the data.) Parameters ---------- path : str The fullpath of the HDF5 file. find_n : int or None The number of binaries to select (at most) or None to select all. select_oneline : function or None The function to operate on chunks of the `oneline` table. select_history : function or None The function to operate on chunks of the `oneline` table. select_columns : None or str or array If None, all columns will be returned. If a string, only those columns with name containing this string will be returned. An array of strings can be used for multiple such patterns. Returns ------- oneline, history The subsets of the dataframes with the data for the selected binaries. """ def say(*args, **kwargs): """Print only if verbose is True.""" if verbose: print(*args, **kwargs) if select_oneline is not None and select_history is not None: raise Exception("Cannot use both `oneline` and `history` selectors.") elif select_oneline is not None: selector = select_oneline tablename = "oneline" elif select_history is not None: selector = select_history tablename = "history" else: selector = None chunksize = _decide_chunksize(find_n) say("Chunk size:", chunksize) if select_columns is not None: say("Deciding which columns to include...") for chunk in pd.read_hdf(path, "oneline", chunksize=1): oneline_columns = _filter_columns(chunk.columns, select_columns) say(" Selected oneline columns:", oneline_columns) break for chunk in pd.read_hdf(path, "history", chunksize=1): history_columns = _filter_columns(chunk.columns, select_columns) say(" Selected history columns:", history_columns) break else: oneline_columns, history_columns = None, None say("Applying the selector...") if selector is None: say(" No selector defined. Selecting all.") else: all_indices, selected_indices = set(), set() for chunk in pd.read_hdf(path, tablename, chunksize=chunksize): all_indices.update(chunk.index) selection = selector(chunk) selected_indices.update(chunk[selection].index) say(" Selected {:7d} out of {:7d} binaries ({:.3g}%)...\t". format(len(selected_indices), len(all_indices), 100*len(selected_indices)/len(all_indices)), end="\r") if find_n is not None and len(selected_indices) >= find_n: selected_indices = set(list(selected_indices)[:find_n]) say("\n Found {} binaries.".format(find_n)) break say() say("Obtaining `oneline` for the selection...") selected_oneline, included_indices = pd.DataFrame(), set() found_them = False for chunk in pd.read_hdf(path, "oneline", chunksize=chunksize, columns=oneline_columns): if selector is not None: chunk = chunk[chunk.index.isin(selected_indices)] included_indices.update(chunk.index) found_them = len(included_indices) == len(selected_indices) selected_oneline = pd.concat([selected_oneline, chunk]) if found_them: say(" Found the binaries.") break say("Obtaining `history` for the selection...") selected_history, included_indices = pd.DataFrame(), set() stop_at_next_chunk, found_them = False, False for chunk in pd.read_hdf(path, "history", chunksize=chunksize, columns=history_columns): if selector is not None: chunk = chunk[chunk.index.isin(selected_indices)] included_indices.update(chunk.index) found_them = len(included_indices) == len(selected_indices) selected_history = pd.concat([selected_history, chunk]) if found_them: if not stop_at_next_chunk: stop_at_next_chunk = True else: say(" Found the binaries.") break return selected_oneline, selected_history