Source code for parsons.etl.table

import logging
import pickle
from enum import Enum
from typing import Union

import petl

from parsons.etl.etl import ETL
from parsons.etl.tofrom import ToFrom
from parsons.utilities import files

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Once the number of direct row lookups on a single Table (see Table.row_data)
# reaches this count, every such lookup logs an inefficiency warning.
DIRECT_INDEX_WARNING_COUNT = 10


class _EmptyDefault(Enum):
    """Sentinel type backing the default argument of Table().

    ``None`` cannot be the default because Table(None) has to be
    rejected, and a literal ``[]`` would be a mutable default. A
    single-member enum provides one unique value that can also be
    named in a type annotation.

    See https://stackoverflow.com/a/76606310 for discussion."""

    # The one and only member; it is compared by identity, its value is irrelevant.
    token = 0


# Sentinel used as the default ``lst`` argument of Table.__init__, where it is
# detected with an identity (``is``) check.
_EMPTYDEFAULT = _EmptyDefault.token


class Table(ETL, ToFrom):
    """
    Create a Parsons Table. Accepts one of the following:
    - A list (or tuple) of lists, with list[0] holding field names, and the other lists
      holding data
    - A list (or tuple) of dicts
    - A petl table

    Calling ``Table()`` with no argument creates an empty table. Passing ``None`` or any
    other unsupported type raises a ``ValueError``.

    `Args:`
        lst: list, tuple or petl table
            See above for accepted formats (optional)
    """

    def __init__(
        self,
        lst: Union[list, tuple, petl.util.base.Table, _EmptyDefault] = _EMPTYDEFAULT,
    ):
        """
        Wrap the supplied data in a petl table and validate the result.

        `Args:`
            lst: list, tuple or petl table
                A sequence of dicts, a sequence of lists/tuples whose first row holds the
                field names, or an existing petl table. When omitted, an empty table is
                created.
        `Raises:`
            ValueError
                If ``lst`` is of an unsupported type (including ``None``), or if no valid
                petl table could be built from it.
        """
        self.table = None

        # A sentinel, rather than None, is the default argument so that the
        # existing behavior is preserved:
        # This is allowed: Table()
        # This should fail: Table(None)
        if lst is _EMPTYDEFAULT:
            self.table = petl.fromdicts([])

        elif isinstance(lst, (list, tuple)):
            if not lst:
                # Empty input yields an empty table
                self.table = petl.fromdicts([])
            else:
                # The first row decides how the whole sequence is interpreted.
                # isinstance (rather than an exact type comparison) also admits
                # subclasses of dict, list and tuple, e.g. OrderedDict rows.
                first_row = lst[0]
                if isinstance(first_row, dict):
                    self.table = petl.fromdicts(lst)
                elif isinstance(first_row, (list, tuple)):
                    self.table = petl.wrap(lst)
                # Any other row type leaves self.table as None, which fails the
                # validity check below.

        elif isinstance(lst, petl.util.base.Table):
            # Create from a petl table
            self.table = lst

        else:
            raise ValueError(
                "Could not initialize table from input type. "
                f"Got {type(lst)}, expected list, tuple, or petl Table"
            )

        if not self.is_valid_table():
            raise ValueError("Could not create Table")

        # Count how many times someone is indexing directly into this table, so we can warn
        # against inefficient usage.
        self._index_count = 0

    def __repr__(self):
        """
        Return the repr of petl's dict view of the underlying table.
        """
        dict_view = petl.dicts(self.table)
        return repr(dict_view)

    def __iter__(self):
        """
        Iterate over the table through petl's dict view of the rows.
        """
        dict_view = petl.dicts(self.table)
        return iter(dict_view)

    def __getitem__(self, index):
        """
        Dispatch ``table[index]`` according to the type of ``index``.

        `Args:`
            index: int, str or slice
                An int is forwarded to :meth:`row_data`, a str to :meth:`column_data`, and
                a slice is applied with ``petl.rowslice``.
        `Returns:`
            The result of :meth:`row_data` or :meth:`column_data`, or, for a slice, a list
            of the rows obtained by iterating the ``petl.rowslice`` result.
        `Raises:`
            TypeError
                If ``index`` is not an int, str or slice.
        """
        if isinstance(index, int):
            return self.row_data(index)

        if isinstance(index, str):
            return self.column_data(index)

        if isinstance(index, slice):
            sliced = petl.rowslice(self.table, index.start, index.stop, index.step)
            return [record for record in sliced]

        raise TypeError("You must pass a string or an index as a value.")

    def __bool__(self):
        """
        Report whether the table contains at least one data row.

        `Returns:`
            bool
                ``True`` if the table has one or more rows, ``False`` otherwise.
        """
        # A single row is enough to decide emptiness, so ask petl for exactly one
        # row instead of relying on petl.head's larger default.
        first_row_only = petl.head(self.table, 1)

        # The table is truthy when that one-row preview is not empty
        return petl.nrows(first_row_only) > 0

    def _repr_html_(self):
        """
        Delegate HTML rendering to the underlying petl table so that the Table displays
        as a well formatted table in Jupyter Notebook.
        """
        rendered = self.table._repr_html_()
        return rendered

    @property
    def num_rows(self):
        """
        Count the rows of the table via ``petl.nrows``.

        `Returns:`
            int
                Number of rows in the table
        """
        row_count = petl.nrows(self.table)
        return row_count

    def __len__(self):
        """
        Support ``len(table)``; the length is the value of :attr:`num_rows`.
        """
        row_count = self.num_rows
        return row_count

    @property
    def data(self):
        """
        Expose the raw data rows (field names excluded) as an iterable of tuples,
        as provided by ``petl.data``.
        """
        raw_rows = petl.data(self.table)
        return raw_rows

    @property
    def columns(self):
        """
        Read the table header and return it as a new list.

        `Returns:`
            list
                List of the table's column names
        """
        header = petl.header(self.table)
        return list(header)

    @property
    def first(self):
        """
        Return the value in the first column of the first data row. Handy for database
        queries that yield a single value.

        `Returns:`
            The first value, or ``None`` when the lookup raises ``IndexError`` (there is
            no first row, or the first row has no values).
        """

        try:
            leading_row = self.data[0]
            return leading_row[0]
        except IndexError:
            # Nothing to return: no first row, or the first row is empty
            return None

    def row_data(self, row_index):
        """
        Look up a single row of the table by position.

        Every call is counted; once the count reaches ``DIRECT_INDEX_WARNING_COUNT`` a
        warning about inefficient direct indexing is logged on each call.

        `Args:`
            row_index: int
                Position of the row to fetch
        `Returns:`
            dict
                A dictionary of the row with the column as the key and the cell
                as the value.
        """

        self._index_count = self._index_count + 1
        lookups_exceeded = self._index_count >= DIRECT_INDEX_WARNING_COUNT
        if lookups_exceeded:
            logger.warning(
                """
                You have indexed directly into this Table multiple times. This can be inefficient,
                as data transformations you've made will be computed _each time_ you index into the
                Table. If you are accessing many rows of data, consider switching to this style of
                iteration, which is much more efficient:
                `for row in table:`
                """
            )

        dict_view = petl.dicts(self.table)
        return dict_view[row_index]

    def column_data(self, column_name):
        """
        Collect every value of one column into a list.

        `Args:`
            column_name: str
                The name of the column
        `Returns:`
            list
                A list of data in the column.
        `Raises:`
            ValueError
                If ``column_name`` is not one of the table's columns.
        """

        # Reject unknown columns up front
        if column_name not in self.columns:
            raise ValueError("Column name not found.")

        column_values = self.table[column_name]
        return list(column_values)

[docs]    def materialize(self):
        """
        "Materializes" a Table, meaning all data is loaded into memory and all pending
        transformations are applied.

        Use this if petl's lazy-loading behavior is causing you problems, eg. if you want to read
        data from a file immediately.
        """

        self.table = petl.wrap(petl.tupleoftuples(self.table))

[docs]    def materialize_to_file(self, file_path=None):
        """
        "Materializes" a Table, meaning all pending transformations are applied.

        Unlike the original materialize function, this method does not bring the data into memory,
        but instead loads the data into a local temp file.

        This method updates the current table in place.

        `Args:`
            file_path: str
                The path to the file to materialize the table to; if not specified, a temp file
                will be created.
        `Returns:`
            str
                Path to the temp file that now contains the table
        """

        # Load the data in batches, and "pickle" the rows to a temp file.
        # (We pickle rather than writing to, say, a CSV, so that we maintain
        # all the type information for each field.)

        file_path = file_path or files.create_temp_file()

        with open(file_path, "wb") as handle:
            for row in self.table:
                pickle.dump(list(row), handle)

        # Load a Table from the file
        self.table = petl.frompickle(file_path)

        return file_path

    def is_valid_table(self):
        """
        Run basic sanity checks on the Table: the wrapped object must be a petl table and
        reading its column names must not raise ``StopIteration``.

        `Returns:`
            bool
        """

        if isinstance(self.table, petl.util.base.Table):
            try:
                # Reading the header is the check; the value itself is not needed
                self.columns
            except StopIteration:
                return False
            return True

        return False

    def empty_column(self, column):
        """
        Tell whether a column holds no non-``None`` values.

        `Args:`
            column: str
                The column name
        `Returns:`
            bool
                ``True`` if the column is empty, ``False`` otherwise.
        """

        # Keep only the rows where the column is not None and count what remains
        populated_rows = petl.selectnotnone(self.table, column)
        return petl.nrows(populated_rows) == 0