Source code for parsons.etl.tofrom

import petl
import json
import io
import gzip
from parsons.utilities import files, zip_archive



[docs]
class ToFrom(object):


[docs]
    def to_dataframe(self, index=None, exclude=None, columns=None,
                     coerce_float=False):
        """
        Outputs table as a Pandas Dataframe

        `Args:`
            index: str, list
                Field of array to use as the index, alternately a specific set
                of input labels to use
            exclude: list
                Columns or fields to exclude
            columns: list
                Column names to use. If the passed data do not have names
                associated with them, this argument provides names for the
                columns. Otherwise this argument indicates the order of the
                columns in the result (any names not found in the data will
                become all-NA columns)
        `Returns:`
            dataframe
                Pandas DataFrame object
        """

        return petl.todataframe(self.table, index=index, exclude=exclude,
                                columns=columns, coerce_float=coerce_float)



[docs]
    def to_html(self, local_path=None, encoding=None, errors='strict',
                index_header=False, caption=None, tr_style=None,
                td_styles=None, truncate=None):
        """
        Outputs table to html.

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the html locally. If not specified, a temporary file will be
                created and returned, and that file will be removed automatically when the script
                is done running.
            encoding: str
                The encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            index_header: boolean
                Prepend index to column names; Defaults to False.
            caption: str
                A caption to include with the html table.
            tr_style: str or callable
                Style to be applied to the table row.
            td_styles: str, dict or callable
                Styles to be applied to the table cells.
            truncate: int
                Length of cell data.
        `Returns:`
            str
                The path of the new file
        """

        if not local_path:
            local_path = files.create_temp_file(suffix=".html")

        petl.tohtml(self.table,
                    source=local_path,
                    encoding=encoding,
                    errors=errors,
                    caption=caption,
                    index_header=index_header,
                    tr_style=tr_style,
                    td_styles=td_styles,
                    truncate=truncate)

        return local_path



[docs]
    def to_csv(self, local_path=None, temp_file_compression=None, encoding=None, errors='strict',
               write_header=True, csv_name=None, **csvargs):
        """
        Outputs table to a CSV. Additional key word arguments are passed to ``csv.writer()``. So,
        e.g., to override the delimiter from the default CSV dialect, provide the delimiter
        keyword argument.

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the csv locally. If it ends in ".gz" or ".zip", the file will be
                compressed. If not specified, a temporary file will be created and returned,
                and that file will be removed automatically when the script is done running.
            temp_file_compression: str
                If a temp file is requested (ie. no ``local_path`` is specified), the compression
                type for that file. Currently "None", "gzip" or "zip" are supported.
                If a ``local_path`` is specified, this argument is ignored.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            write_header: boolean
                Include header in output
            csv_name: str
                If ``zip`` compression (either specified or inferred), the name of csv file
                within the archive.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments

        `Returns:`
            str
                The path of the new file
        """  # noqa: W605

        # If a zip archive.
        if files.zip_check(local_path, temp_file_compression):
            return self.to_zip_csv(archive_path=local_path,
                                   encoding=encoding,
                                   errors=errors,
                                   write_header=write_header,
                                   csv_name=csv_name,
                                   **csvargs)

        if not local_path:
            suffix = '.csv' + files.suffix_for_compression_type(temp_file_compression)
            local_path = files.create_temp_file(suffix=suffix)

        # Create normal csv/.gzip
        petl.tocsv(self.table,
                   source=local_path,
                   encoding=encoding,
                   errors=errors,
                   write_header=write_header,
                   **csvargs)

        return local_path



[docs]
    def append_csv(self, local_path, encoding=None, errors='strict', **csvargs):
        """
        Appends table to an existing CSV.

        Additional additional key word arguments
        are passed to ``csv.writer()``. So, e.g., to override the delimiter
        from the default CSV dialect, provide the delimiter keyword argument.

        `Args:`
            local_path: str
                The local path of an existing CSV file. If it ends in ".gz", the file will
                be compressed.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            \**csvargs: kwargs
                ``csv_writer`` optional arguments

        `Returns:`
            str
                The path of the file
        """  # noqa: W605

        petl.appendcsv(self.table,
                       source=local_path,
                       encoding=encoding,
                       errors=errors,
                       **csvargs)
        return local_path



[docs]
    def to_zip_csv(self, archive_path=None, csv_name=None, encoding=None,
                   errors='strict', write_header=True, if_exists='replace', **csvargs):
        """
        Outputs table to a CSV in a zip archive. Additional key word arguments are passed to
        ``csv.writer()``. So, e.g., to override the delimiter from the default CSV dialect,
        provide the delimiter keyword argument. Use thismethod if you would like to write
        multiple csv files to the same archive.

        .. warning::
                If a file already exists in the archive, it will be overwritten.

        `Args:`
            archive_path: str
                The path to zip achive. If not specified, a temporary file will be created and
                returned, and that file will be removed automatically when the script is done
                running.
            csv_name: str
                The name of the csv file to be stored in the archive. If ``None`` will use
                the archive name.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            write_header: boolean
                Include header in output
            if_exists: str
                If archive already exists, one of 'replace' or 'append'
            \**csvargs: kwargs
                ``csv_writer`` optional arguments

        `Returns:`
            str
                The path of the archive
        """  # noqa: W605

        if not archive_path:
            archive_path = files.create_temp_file(suffix='.zip')

        cf = self.to_csv(encoding=encoding, errors=errors, write_header=write_header, **csvargs)

        if not csv_name:
            csv_name = files.extract_file_name(archive_path, include_suffix=False) + '.csv'

        return zip_archive.create_archive(archive_path, cf, file_name=csv_name,
                                          if_exists=if_exists)



[docs]
    def to_json(self, local_path=None, temp_file_compression=None, line_delimited=False):
        """
        Outputs table to a JSON file

        .. warning::
                If a file already exists at the given location, it will be
                overwritten.

        `Args:`
            local_path: str
                The path to write the JSON locally. If it ends in ".gz", it will be
                compressed first. If not specified, a temporary file will be created and returned,
                and that file will be removed automatically when the script is done running.
            temp_file_compression: str
                If a temp file is requested (ie. no ``local_path`` is specified), the compression
                type for that file. Currently "None" and "gzip" are supported.
                If a ``local_path`` is specified, this argument is ignored.
            line_delimited: bool
                Whether the file will be line-delimited JSON (with a row on each line), or a proper
                JSON file.

        `Returns:`
            str
                The path of the new file
        """

        if not local_path:
            suffix = '.json' + files.suffix_for_compression_type(temp_file_compression)
            local_path = files.create_temp_file(suffix=suffix)

        # Note we don't use the much simpler petl.tojson(), since that method reads the whole
        # table into memory before writing to file.

        if files.is_gzip_path(local_path):
            open_fn = gzip.open
            mode = 'w+t'
        else:
            open_fn = open
            mode = 'w'

        with open_fn(local_path, mode) as file:
            if not line_delimited:
                file.write('[')

            i = 0
            for row in self:
                if i:
                    if not line_delimited:
                        file.write(',')
                    file.write('\n')
                i += 1
                json.dump(row, file)

            if not line_delimited:
                file.write(']')

        return local_path



[docs]
    def to_dicts(self):
        """
        Output table as a list of dicts.

        `Returns:`
            list
        """

        return list(petl.dicts(self.table))



[docs]
    def to_sftp_csv(self, remote_path, host, username, password, port=22, encoding=None,
                    compression=None, errors='strict', write_header=True,
                    rsa_private_key_file=None, **csvargs):
        """
        Writes the table to a CSV file on a remote SFTP server

        `Args:`
            remote_path: str
                The remote path of the file. If it ends in '.gz', the file will be compressed.
            host: str
                The remote host
            username: str
                The username to access the SFTP server
            password: str
                The password to access the SFTP server
            port: int
                The port number of the SFTP server
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            write_header: boolean
                Include header in output
            rsa_private_key_file str
                Absolute path to a private RSA key used
                to authenticate stfp connection
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        """  # noqa: W605

        from parsons.sftp import SFTP

        sftp = SFTP(host, username, password, port, rsa_private_key_file)

        compression = files.compression_type_for_path(remote_path)

        local_path = self.to_csv(
            temp_file_compression=compression, encoding=encoding, errors=errors,
            write_header=write_header, **csvargs)
        sftp.put_file(local_path, remote_path)



[docs]
    def to_s3_csv(self, bucket, key, aws_access_key_id=None,
                  aws_secret_access_key=None, compression=None, encoding=None,
                  errors='strict', write_header=True, acl='bucket-owner-full-control',
                  public_url=False, public_url_expires=3600, **csvargs):
        """
        Writes the table to an s3 object as a CSV

        `Args:`
            bucket: str
                The s3 bucket to upload to
            key: str
                The s3 key to name the file. If it ends in '.gz' or '.zip', the file will be
                compressed.
            aws_access_key_id: str
                Required if not included as environmental variable
            aws_secret_access_key: str
                Required if not included as environmental variable
            compression: str
                The compression type for the s3 object. Currently "None", "zip" and "gzip" are
                supported. If specified, will override the key suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            write_header: boolean
                Include header in output
            public_url: boolean
                Create a public link to the file
            public_url_expire: 3600
                The time, in seconds, until the url expires if ``public_url`` set to ``True``.
            acl: str
                The S3 permissions on the file
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        `Returns:`
            Public url if specified. If not ``None``.
        """  # noqa: W605

        compression = compression or files.compression_type_for_path(key)

        csv_name = files.extract_file_name(key, include_suffix=False) + '.csv'

        # Save the CSV as a temp file
        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 csv_name=csv_name,
                                 **csvargs)

        # Put the file on S3
        from parsons.aws import S3
        self.s3 = S3(aws_access_key_id=aws_access_key_id,
                     aws_secret_access_key=aws_secret_access_key)
        self.s3.put_file(bucket, key, local_path, acl=acl)

        if public_url:
            return self.s3.get_url(bucket, key, expires_in=public_url_expires)
        else:
            return None



[docs]
    def to_gcs_csv(self, bucket_name, blob_name, app_creds=None, project=None, compression=None,
                   encoding=None, errors='strict', write_header=True, public_url=False,
                   public_url_expires=60, **csvargs):
        """
        Writes the table to a Google Cloud Storage blob as a CSV.

        `Args:`
            bucket_name: str
                The bucket to upload to
            blob_name: str
                The blob to name the file. If it ends in '.gz' or '.zip', the file will be
                compressed.
            app_creds: str
                A credentials json string or a path to a json file. Not required
                if ``GOOGLE_APPLICATION_CREDENTIALS`` env variable set.
            project: str
                The project which the client is acting on behalf of. If not passed
                then will use the default inferred environment.
            compression: str
                The compression type for the csv. Currently "None", "zip" and "gzip" are
                supported. If specified, will override the key suffix.
            encoding: str
                The CSV encoding type for `csv.writer()
                <https://docs.python.org/2/library/csv.html#csv.writer/>`_
            errors: str
                Raise an Error if encountered
            write_header: boolean
                Include header in output
            public_url: boolean
                Create a public link to the file
            public_url_expire: 60
                The time, in minutes, until the url expires if ``public_url`` set to ``True``.
            \**csvargs: kwargs
                ``csv_writer`` optional arguments
        `Returns:`
            Public url if specified. If not ``None``.
        """  # noqa: W605

        compression = compression or files.compression_type_for_path(blob_name)

        csv_name = files.extract_file_name(blob_name, include_suffix=False) + '.csv'

        # Save the CSV as a temp file
        local_path = self.to_csv(temp_file_compression=compression,
                                 encoding=encoding,
                                 errors=errors,
                                 write_header=write_header,
                                 csv_name=csv_name,
                                 **csvargs)

        from parsons.google.google_cloud_storage import GoogleCloudStorage
        gcs = GoogleCloudStorage(app_creds=app_creds, project=project)
        gcs.put_blob(bucket_name, blob_name, local_path)

        if public_url:
            return gcs.get_url(bucket_name, blob_name, expires_in=public_url_expires)
        else:
            return None



[docs]
    def to_redshift(self, table_name, username=None, password=None, host=None,
                    db=None, port=None, **copy_args):
        """
        Write a table to a Redshift database. Note, this requires you to pass
        AWS S3 credentials or store them as environmental variables.

        Args:
            table_name: str
                The table name and schema (``my_schema.my_table``) to point the file.
            username: str
                Required if env variable ``REDSHIFT_USERNAME`` not populated
            password: str
                Required if env variable ``REDSHIFT_PASSWORD`` not populated
            host: str
                Required if env variable ``REDSHIFT_HOST`` not populated
            db: str
                Required if env variable ``REDSHIFT_DB`` not populated
            port: int
                Required if env variable ``REDSHIFT_PORT`` not populated. Port 5439 is typical.
            \**copy_args: kwargs
                See :func:`~parsons.databases.Redshift.copy`` for options.

        Returns:
            ``None``
        """  # noqa: W605

        from parsons.databases.redshift import Redshift
        rs = Redshift(username=username, password=password, host=host, db=db, port=port)
        rs.copy(self, table_name, **copy_args)



[docs]
    def to_postgres(self, table_name, username=None, password=None, host=None,
                    db=None, port=None, **copy_args):
        """
        Write a table to a Postgres database.

        Args:
            table_name: str
                The table name and schema (``my_schema.my_table``) to point the file.
            username: str
                Required if env variable ``PGUSER`` not populated
            password: str
                Required if env variable ``PGPASSWORD`` not populated
            host: str
                Required if env variable ``PGHOST`` not populated
            db: str
                Required if env variable ``PGDATABASE`` not populated
            port: int
                Required if env variable ``PGPORT`` not populated.
            \**copy_args: kwargs
                See :func:`~parsons.databases.Postgres.copy`` for options.

        Returns:
            ``None``
        """  # noqa: W605

        from parsons.databases.postgres import Postgres
        pg = Postgres(username=username, password=password, host=host, db=db, port=port)
        pg.copy(self, table_name, **copy_args)


    def to_petl(self):

        return self.table


[docs]
    def to_civis(self, table, api_key=None, db=None, max_errors=None,
                 existing_table_rows='fail', diststyle=None, distkey=None,
                 sortkey1=None, sortkey2=None, wait=True, **civisargs):
        """
        Write the table to a Civis Redshift cluster. Additional key word
        arguments can passed to `civis.io.dataframe_to_civis()
        <https://civis-python.readthedocs.io/en/v1.9.0/generated/civis.io.dataframe_to_civis.html#civis.io.dataframe_to_civis>`_ # noqa: E501

        `Args`
            table: str
                The schema and table you want to upload to. E.g.,
                'scratch.table'. Schemas or tablenames with periods must be
                double quoted, e.g. 'scratch."my.table"'.
            api_key: str
                Your Civis API key. If not given, the CIVIS_API_KEY environment
                variable will be used.
            db: str or int
                The Civis Database. Can be database name or ID
            max_errors: int
                The maximum number of rows with errors to remove from
                the import before failing.
            diststyle: str
                The distribution style for the table. One of `'even'`, `'all'`
                or `'key'`.
            existing_table_rows: str
                The behaviour if a table with the requested name already
                exists. One of `'fail'`, `'truncate'`, `'append'` or `'drop'`.
                Defaults to `'fail'`.
            distkey: str
                The column to use as the distkey for the table.
            sortkey1: str
                The column to use as the sortkey for the table.
            sortkey2: str
                The second column in a compound sortkey for the table.
            wait: boolean
                Wait for write job to complete before exiting method.
        """

        from parsons.civis.civisclient import CivisClient
        civis = CivisClient(db=db, api_key=api_key)
        return civis.table_import(
            self, table, max_errors=max_errors,
            existing_table_rows=existing_table_rows, diststyle=diststyle,
            distkey=distkey, sortkey1=sortkey1, sortkey2=sortkey2, wait=wait,
            **civisargs)



[docs]
    @classmethod
    def from_csv(cls, local_path, **csvargs):
        """
        Create a ``parsons table`` object from a CSV file

        `Args:`
            local_path: obj
                A csv formatted local path, url or ftp. If this is a
                file path that ends in ".gz", the file will be decompressed first.
            \**csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """  # noqa: W605

        remote_prefixes = ["http://", "https://", "ftp://", "s3://"]
        if any(map(local_path.startswith, remote_prefixes)):
            is_remote_file = True
        else:
            is_remote_file = False

        if not is_remote_file and not files.has_data(local_path):
            raise ValueError('CSV file is empty')

        return cls(petl.fromcsv(local_path, **csvargs))



[docs]
    @classmethod
    def from_csv_string(cls, str, **csvargs):
        """
        Create a ``parsons table`` object from a string representing a CSV.

        `Args:`
            str: str
                The string object to convert to a table
            **csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        bytesio = io.BytesIO(str.encode('utf-8'))
        memory_source = petl.io.sources.MemorySource(bytesio.read())
        return cls(petl.fromcsv(memory_source, **csvargs))



[docs]
    @classmethod
    def from_columns(cls, cols, header=None):
        """
        Create a ``parsons table`` from a list of lists organized as columns

        `Args:`
            cols: list
                A list of lists organized as columns
            header: list
                List of column names. If not specified, will use dummy column names
        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        return cls(petl.fromcolumns(cols, header=header))



[docs]
    @classmethod
    def from_json(cls, local_path, header=None, line_delimited=False):
        """
        Create a ``parsons table`` from a json file

        `Args:`
            local_path: list
                A JSON formatted local path, url or ftp. If this is a
                file path that ends in ".gz", the file will be decompressed first.
            header: list
                List of columns to use for the destination table. If omitted, columns will
                be inferred from the initial data in the file.
            line_delimited: bool
                Whether the file is line-delimited JSON (with a row on each line), or a proper
                JSON file.
        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        if line_delimited:
            if files.is_gzip_path(local_path):
                open_fn = gzip.open
            else:
                open_fn = open

            with open_fn(local_path, 'r') as file:
                rows = [json.loads(line) for line in file]
            return cls(rows)

        else:
            return cls(petl.fromjson(local_path, header=header))



[docs]
    @classmethod
    def from_redshift(cls, sql, username=None, password=None, host=None,
                      db=None, port=None):
        """
        Create a ``parsons table`` from a Redshift query.

        To pull an entire Redshift table, use a query like ``SELECT * FROM tablename``.

        `Args:`
            sql: str
                A valid SQL statement
            username: str
                Required if env variable ``REDSHIFT_USERNAME`` not populated
            password: str
                Required if env variable ``REDSHIFT_PASSWORD`` not populated
            host: str
                Required if env variable ``REDSHIFT_HOST`` not populated
            db: str
                Required if env variable ``REDSHIFT_DB`` not populated
            port: int
                Required if env variable ``REDSHIFT_PORT`` not populated. Port 5439 is typical.

        `Returns:`
            Parsons Table
                See :ref:`parsons-table` for output options.
        """

        from parsons.databases.redshift import Redshift
        rs = Redshift(username=username, password=password, host=host, db=db, port=port)
        return rs.query(sql)



[docs]
    @classmethod
    def from_postgres(cls, sql, username=None, password=None, host=None, db=None, port=None):
        """
        Args:
            sql: str
                A valid SQL statement
            username: str
                Required if env variable ``PGUSER`` not populated
            password: str
                Required if env variable ``PGPASSWORD`` not populated
            host: str
                Required if env variable ``PGHOST`` not populated
            db: str
                Required if env variable ``PGDATABASE`` not populated
            port: int
                Required if env variable ``PGPORT`` not populated.
        """

        from parsons.databases.postgres import Postgres
        pg = Postgres(username=username, password=password, host=host, db=db, port=port)
        return pg.query(sql)



[docs]
    @classmethod
    def from_s3_csv(cls, bucket, key, from_manifest=False, aws_access_key_id=None,
                    aws_secret_access_key=None, **csvargs):
        """
        Create a ``parsons table`` from a key in an S3 bucket.

        `Args:`
            bucket: str
                The S3 bucket.
            key: str
                The S3 key
            from_manifest: bool
                If True, treats `key` as a manifest file and loads all urls into a `parsons.Table`.
                Defaults to False.
            aws_access_key_id: str
                Required if not included as environmental variable.
            aws_secret_access_key: str
                Required if not included as environmental variable.
            \**csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            `parsons.Table` object
        """  # noqa: W605

        from parsons.aws import S3
        s3 = S3(aws_access_key_id, aws_secret_access_key)

        if from_manifest:
            with open(s3.get_file(bucket, key)) as fd:
                manifest = json.load(fd)

            s3_keys = [x["url"] for x in manifest["entries"]]

        else:
            s3_keys = [f"s3://{bucket}/{key}"]

        tbls = []
        for key in s3_keys:
            # TODO handle urls that end with '/', i.e. urls that point to "folders"
            _, _, bucket_, key_ = key.split("/", 3)
            file_ = s3.get_file(bucket_, key_)
            if files.compression_type_for_path(key_) == 'zip':
                file_ = zip_archive.unzip_archive(file_)

            tbls.append(petl.fromcsv(file_, **csvargs))

        return cls(petl.cat(*tbls))



[docs]
    @classmethod
    def from_dataframe(cls, dataframe, include_index=False):
        """
        Create a ``parsons table`` from a Pandas dataframe.

        `Args:`
            dataframe: dataframe
                A valid Pandas dataframe objectt
            include_index: boolean
                Include index column
        """

        return cls(petl.fromdataframe(dataframe, include_index=include_index))