py_research.db.base module#

Base classes and types for relational database package.

class Table(db, df, source_map, indexes=<factory>)[source]#

Bases: object

Table in a relational database.

Parameters:
db: DB#

Database this table belongs to.

df: DataFrame#

Dataframe containing the data of this table.

source_map: str | dict[str, str]#

Mapping to source tables of this table.

For single source tables, this is a string containing the name of the source table.

For multiple source tables, the dataframe hast multi-level columns. source_map is then a mapping from the first level of these columns to the source tables.

indexes: dict[str, str]#

Mapping from source table names to index column names.

static from_excel(path)[source]#

Load relational table from Excel file.

Parameters:

path (Path) –

Return type:

Table

to_excel(path)[source]#

Save this table to an Excel file.

Parameters:

path (Path) –

Return type:

None

filter(filter_series)[source]#

Filter this table.

Parameters:

filter_series (Series) – Boolean series to filter the table with.

Returns:

New table containing the filtered data.

Return type:

Table

merge(right: SingleTable | None = None, link_to_right: str | tuple[str, str] = None, link_to_left: str | None = None, link_table: SingleTable | None = None, naming: Literal['source', 'path'] = 'source') Table[source]#
merge(right: SingleTable = None, link_to_right: str | tuple[str, str] | None = None, link_to_left: str | None = None, link_table: SingleTable | None = None, naming: Literal['source', 'path'] = 'source') Table
merge(right: SingleTable | None = None, link_to_right: str | tuple[str, str] | None = None, link_to_left: str | None = None, link_table: SingleTable = None, naming: Literal['source', 'path'] = 'source') Table

Merge this table with another, returning a new table.

Parameters:
  • link_to_right – Name of column to use for linking from left to right table.

  • link_to_left – Name of column to use for linking from right to left table.

  • right – Other (left) table to merge with.

  • link_table – Link table (join table) to use for double merging.

  • naming – Naming strategy to use for naming the first level of merged columns. Use “path” if you merge multiple times from the same source table.

Note

At least one of link_to_right, right or link_table must be supplied.

Returns:

New table containing the merged data. The returned table will have a multi-level column index, where the first level references the source table of each column via the source_map attribute.

flatten(sep='.', prefix_strategy='always')[source]#

Collapse multi-dim. column labels of multi-source table, returning new df.

Parameters:
  • sep (str) – Separator to use between column levels.

  • prefix_strategy (Literal['always', 'on_conflict']) – Strategy to use for prefixing column names.

Returns:

Dataframe representation of this table with flattened multi-dim columns.

Return type:

DataFrame

extract(with_relations=True)[source]#

Extract this table into its own database.

Returns:

Database containing only this table.

Parameters:

with_relations (bool) –

Return type:

DB

keys()[source]#
Return type:

Iterable[str]

get(name, default=None)[source]#
Parameters:
  • name (str) –

  • default (Any) –

Return type:

Any

property columns: Sequence[str | tuple[str, str]]#

Return the columns of this table.

class SingleTable(name, db, df)[source]#

Bases: Table

Relational database table with a single source table.

Parameters:
property source_map: str#

Name of the source table of this table.

property indexes: dict[str, str]#

Name of the source table of this table.

filter(filter_series)[source]#

Return a filtered version of this table.

Parameters:

filter_series (Series) –

Return type:

SingleTable

trim(cols)[source]#

Return a trimmed version of this table.

Parameters:

cols (list[str]) – Columns to keep.

Return type:

SingleTable

extend(other, conflict_policy='raise')[source]#

Extend this table with data from another, returning a new table.

Parameters:
  • other (DataFrame) – Other table to extend with.

  • conflict_policy (Literal['raise', 'ignore', 'override'] | dict[str, ~typing.Literal['raise', 'ignore', 'override']]) – Policy to use for resolving conflicts. Can be a global setting, per-column via supplying a dict with column names as keys.

Returns:

New table containing the extended data.

Return type:

SingleTable

class SourceTable(name, db)[source]#

Bases: SingleTable

Original table in a relational database.

Parameters:
  • name (str) –

  • db (DB) –

property df: DataFrame#

Return the dataframe of this table.

class DBSchema[source]#

Bases: object

Base class for static database schemas.

class DB(table_dfs=<factory>, relations=<factory>, join_tables=<factory>, schema=None, updates=<factory>, backend=None, _copied=False)[source]#

Bases: object

Relational database consisting of multiple named tables.

Parameters:
table_dfs: dict[str, DataFrame]#

Dataframes containing the data of each table in this database.

relations: dict[tuple[str, str], tuple[str, str]]#

Relations between tables in this database.

join_tables: set[str]#

Names of tables that are used as n-to-m join tables in this database.

schema: type[DBSchema] | None = None#

Schema of this database.

updates: dict[Timestamp, dict[str, Any]]#

List of update times with comments, tags, authors, etc.

backend: Path | None = None#

File backend of this database, hence where it was loaded from and is saved to by default.

static load(path, auto_parse_dtype=False)[source]#

Load a database from an excel file.

Parameters:
  • path (Path | str) – Path to the excel file.

  • auto_parse_dtype (bool) – Whether to automatically parse the dtypes of the data.

Returns:

Database object.

Return type:

DB

save(path=None)[source]#

Save this database to an excel file.

Parameters:

path (Path | str | None) – Path to excel file. Will be overridden if it exists. Uses this database’s backend as default path, if none given.

Return type:

None

describe()[source]#

Return a description of this database.

Returns:

Mapping of table names to table descriptions.

Return type:

dict[str, str | dict[str, str]]

copy(deep=True)[source]#

Create a copy of this database, optionally deep.

Parameters:

deep (bool) – Whether to perform a deep copy (copy all dataframes).

Returns:

Copy of this database.

Return type:

DB

extend(other, conflict_policy='raise')[source]#

Extend this database with data from another, returning a new database.

Parameters:
  • other (DB | dict[str, DataFrame] | Table) – Other database, dataframe dict or table to extend with.

  • conflict_policy (Literal['raise', 'ignore', 'override'] | dict[str, ~typing.Literal['raise', 'ignore', 'override'] | dict[str, ~typing.Literal['raise', 'ignore', 'override']]]) – Policy to use for resolving conflicts. Can be a global setting, per-table via supplying a dict with table names as keys, or per-column via supplying a dict of dicts.

Returns:

New database containing the extended data.

Return type:

DB

trim(centers=None, circuit_breakers=None)[source]#

Return new database minus orphan data (relative to given centers).

Parameters:
  • centers (list[str] | None) – Tables to use as centers for the trim.

  • circuit_breakers (list[str] | None) – Tables to use as circuit breakers for the trim.

Returns:

New database containing the trimmed data.

Return type:

DB

filter(filters, extra_cb=None)[source]#

Return new db only containing data related to rows matched by filters.

Parameters:
  • filters (dict[str, Series]) – Mapping of table names to boolean filter series

  • extra_cb (list[str] | None) – Additional circuit breakers (on top of the filtered tables) to use when trimming the database according to the filters

Returns:

New database containing the filtered data. The returned database will only contain the filtered tables and all tables that have (indirect) references to them.

Return type:

DB

Note

This is equivalent to trimming the database with the filtered tables as centers and the filtered tables and extra_cb as circuit breakers.

to_graph(nodes)[source]#

Export links between select database objects in a graph format.

E.g. for usage with Gephi

Parameters:

nodes (Sequence[Table | str]) –

Return type:

tuple[DataFrame, DataFrame]

keys()[source]#
Return type:

set[str]

values()[source]#
Return type:

Iterable[SingleTable]

items()[source]#
Return type:

Iterable[tuple[str, SingleTable]]

get(name)[source]#
Parameters:

name (str) –

Return type:

SingleTable | None