import datetime
import itertools
import logging
from functools import partial, wraps
from pathlib import Path
from typing import Literal

import petl
import requests
from github import Auth as PyGithubAuth
from github import Github as PyGithub
from github.GithubException import UnknownObjectException

from parsons.etl.table import Table
from parsons.utilities import check_env, files
# Module-level logger named after this module; the GitHub connector methods in this file log
# their listing and download activity through it.
logger = logging.getLogger(__name__)
def _wrap_method(decorator, method):
@wraps(method)
def _wrapper(self, *args, **kwargs):
bound_method = partial(method.__get__(self, type(self)))
return decorator(bound_method)(*args, **kwargs)
return _wrapper
def decorate_methods(decorator):
    """Create a class decorator that applies ``decorator`` to a class's methods.

    Based on Django's ``django.utils.decorators.method_decorator``.

    Args:
        decorator: Callable
            Function decorator to apply to each method.

    Returns:
        Callable
            Class decorator. It replaces every callable attribute listed by ``dir(cls)`` whose
            name does not start with a double underscore with a wrapped version, then returns
            the same class object.
    """

    def decorate(cls):
        for attr_name in dir(cls):
            # Leave dunder methods (``__init__``, ``__repr__``, ...) untouched.
            if attr_name[:2] == "__":
                continue
            attr = getattr(cls, attr_name)
            if not callable(attr):
                continue
            setattr(cls, attr_name, _wrap_method(decorator, attr))
        return cls

    return decorate
def wrap_github_404(func):
    """Decorate ``func`` so PyGithub "unknown object" errors surface as ``ParsonsGitHubError``.

    Args:
        func: Callable
            Callable to protect.

    Returns:
        Callable
            Wrapper that returns whatever ``func`` returns and re-raises
            ``UnknownObjectException`` as ``ParsonsGitHubError`` (chained to the original error).
    """

    @wraps(func)
    def _wrapped_func(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except UnknownObjectException as not_found:
            message = "Couldn't find the object you referenced, maybe you need to log in?"
            raise ParsonsGitHubError(message) from not_found
        return result

    return _wrapped_func
class ParsonsGitHubError(Exception):
    """Error raised by the GitHub connector, e.g. when a referenced object cannot be found."""
@decorate_methods(wrap_github_404)
class GitHub:
    """Creates a GitHub class for accessing the GitHub API.

    Uses ``parsons.utilities.check_env`` to load credentials from environment variables if not
    supplied. Supports either a username and password or an access token for authentication. The
    client also supports unauthenticated access.

    The class decorator wraps every non-dunder method with ``wrap_github_404``, so PyGithub's
    ``UnknownObjectException`` is re-raised to callers as ``ParsonsGitHubError``.

    Args:
        username: Optional[str]
            Username of account to use for credentials. Can be set with ``GITHUB_USERNAME``
            environment variable.
        password: Optional[str]
            Password of account to use for credentials. Can be set with ``GITHUB_PASSWORD``
            environment variable.
        access_token: Optional[str]
            Access token to use for credentials. Can be set with ``GITHUB_ACCESS_TOKEN`` environment
            variable.
    """

    def __init__(self, username=None, password=None, access_token=None):
        self.username = check_env.check("GITHUB_USERNAME", username, optional=True)
        self.password = check_env.check("GITHUB_PASSWORD", password, optional=True)
        self.access_token = check_env.check("GITHUB_ACCESS_TOKEN", access_token, optional=True)

        # Username/password wins over a token; with neither, use an unauthenticated client.
        if self.username and self.password:
            self.client = PyGithub(
                auth=PyGithubAuth.Login(login=self.username, password=self.password)
            )
        elif self.access_token:
            self.client = PyGithub(auth=PyGithubAuth.Token(token=self.access_token))
        else:
            self.client = PyGithub()

    def _as_table(self, paginated_list, page=None, page_size=100):
        """Converts a paginated list into a Parsons ``Table``. Uses the ``_rawData`` property of
        each item instead of calling ``raw_data`` to avoid making a separate request for each item
        in a page for types that PyGithub doesn't consider complete.

        Args:
            paginated_list: ``pygithub.PaginatedList.PaginatedList``
                PyGithub paginated list
            page: Optional[int]
                Page number to load (1-based). Defaults to None. If not specified, all results
                are returned.
            page_size: int
                Page size. Defaults to 100. Ignored if ``page`` is not set.

        Returns:
            ``Table``
                Table object created from the raw data of the list
        """
        stream = (item._rawData for item in paginated_list)

        if page is not None:
            # Slice the lazily-evaluated stream so only the requested window is materialized.
            start = (page - 1) * page_size
            stop = start + page_size
            stream = itertools.islice(stream, start, stop)

        return Table(list(stream))

    def get_user(self, username):
        """Loads a GitHub user by username

        Args:
            username: str
                Username of user to load

        Returns:
            dict
                User information
        """
        return self.client.get_user(username).raw_data

    def get_organization(self, organization_name):
        """Loads a GitHub organization by name

        Args:
            organization_name: str
                Name of organization to load

        Returns:
            dict
                Organization information
        """
        return self.client.get_organization(organization_name).raw_data

    def get_repo(self, repo_name):
        """Loads a GitHub repo by name

        Args:
            repo_name: str
                Full repo name (account/name)

        Returns:
            dict
                Repo information
        """
        return self.client.get_repo(repo_name).raw_data

    def list_user_repos(self, username, page=None, page_size=100):
        """List user repos with pagination, returning a ``Table``

        Args:
            username: str
                GitHub username
            page: Optional[int]
                Page number. All results are returned if not set.
            page_size: int
                Page size. Defaults to 100.

        Returns:
            ``Table``
                Table with page of user repos
        """
        logger.info(f"Listing page {page} of repos for user {username}")

        return self._as_table(
            self.client.get_user(username).get_repos(), page=page, page_size=page_size
        )

    def list_organization_repos(self, organization_name, page=None, page_size=100):
        """List organization repos with pagination, returning a ``Table``

        Args:
            organization_name: str
                GitHub organization name
            page: Optional[int]
                Page number. All results are returned if not set.
            page_size: int
                Page size. Defaults to 100.

        Returns:
            ``Table``
                Table with page of organization repos
        """
        logger.info(f"Listing page {page} of repos for organization {organization_name}")

        return self._as_table(
            self.client.get_organization(organization_name).get_repos(),
            page=page,
            page_size=page_size,
        )

    def get_issue(self, repo_name, issue_number):
        """Loads a GitHub issue

        Args:
            repo_name: str
                Full repo name (account/name)
            issue_number: int
                Number of issue to load

        Returns:
            dict
                Issue information
        """
        return self.client.get_repo(repo_name).get_issue(number=issue_number).raw_data

    def list_repo_issues(
        self,
        repo_name,
        state: Literal["open", "closed", "all"] = "open",
        assignee=None,
        creator=None,
        mentioned=None,
        labels=None,
        sort: Literal["created", "updated", "comments"] = "created",
        direction: Literal["asc", "desc"] = "desc",
        since=None,
        page=None,
        page_size=100,
    ):
        """List issues for a given repo

        Args:
            repo_name: str
                Full repo name (account/name)
            state: str
                State of issues to return. One of "open", "closed", "all". Defaults to "open".
            assignee: Optional[str]
                Name of assigned user, "none", or "*".
            creator: Optional[str]
                Name of user that created the issue.
            mentioned: Optional[str]
                Name of user mentioned in the issue. A login name is resolved to a PyGithub user
                object (one extra API request) before filtering.
            labels: Optional[list[str]]
                List of label names. Defaults to None, meaning no label filter.
            sort: str
                What to sort results by. One of "created", "updated", "comments". Defaults to
                "created".
            direction: str
                Direction to sort. One of "asc", "desc". Defaults to "desc".
            since: Optional[Union[datetime.datetime, datetime.date]]
                Timestamp to pull issues since. A plain date is treated as midnight of that day.
                Defaults to None.
            page: Optional[int]
                Page number. All results are returned if not set.
            page_size: int
                Page size. Defaults to 100.

        Returns:
            ``Table``
                Table with page of repo issues
        """
        logger.info(f"Listing page {page} of issues for repo {repo_name}")

        kwargs_dict = {"state": state, "sort": sort, "direction": direction}
        if assignee:
            kwargs_dict["assignee"] = assignee
        if creator:
            kwargs_dict["creator"] = creator
        if mentioned:
            # PyGithub documents ``mentioned`` as a NamedUser, so resolve login names first.
            if isinstance(mentioned, str):
                mentioned = self.client.get_user(mentioned)
            kwargs_dict["mentioned"] = mentioned
        if labels:
            # PyGithub joins the label names itself; it expects a list, not a joined string.
            kwargs_dict["labels"] = list(labels)
        if since:
            # PyGithub formats the timestamp itself and requires a ``datetime``. ``datetime`` is
            # a subclass of ``date``, so test for it first and promote plain dates to midnight.
            if not isinstance(since, datetime.datetime):
                since = datetime.datetime.combine(since, datetime.time.min)
            kwargs_dict["since"] = since

        return self._as_table(
            self.client.get_repo(repo_name).get_issues(**kwargs_dict),
            page=page,
            page_size=page_size,
        )

    def get_pull_request(self, repo_name, pull_request_number):
        """Loads a GitHub pull request

        Args:
            repo_name: str
                Full repo name (account/name)
            pull_request_number: int
                Pull request number

        Returns:
            dict
                Pull request information
        """
        return self.client.get_repo(repo_name).get_pull(pull_request_number).raw_data

    def list_repo_pull_requests(
        self,
        repo_name,
        state: Literal["open", "closed", "all"] = "open",
        base=None,
        sort: Literal["created", "updated", "popularity"] = "created",
        direction: Literal["asc", "desc"] = "desc",
        page=None,
        page_size=100,
    ):
        """Lists pull requests for a given repo

        Args:
            repo_name: str
                Full repo name (account/name)
            state: str
                One of "open", "closed", "all". Defaults to "open".
            base: Optional[str]
                Base branch to filter pull requests by.
            sort: str
                How to sort pull requests. One of "created", "updated", "popularity". Defaults to
                "created".
            direction: str
                Direction to sort by. Defaults to "desc".
            page: Optional[int]
                Page number. All results are returned if not set.
            page_size: int
                Page size. Defaults to 100.

        Returns:
            ``Table``
                Table with page of repo pull requests
        """
        logger.info(f"Listing page {page} of pull requests for repo {repo_name}")

        kwargs_dict = {"state": state, "sort": sort, "direction": direction}
        if base:
            kwargs_dict["base"] = base

        return self._as_table(
            self.client.get_repo(repo_name).get_pulls(**kwargs_dict),
            page=page,
            page_size=page_size,
        )

    def list_repo_contributors(self, repo_name, page=None, page_size=100):
        """Lists contributors for a given repo

        Args:
            repo_name: str
                Full repo name (account/name)
            page: Optional[int]
                Page number. All results are returned if not set.
            page_size: int
                Page size. Defaults to 100.

        Returns:
            ``Table``
                Table with page of repo contributors
        """
        logger.info(f"Listing page {page} of contributors for repo {repo_name}")

        return self._as_table(
            self.client.get_repo(repo_name).get_contributors(),
            page=page,
            page_size=page_size,
        )

    def download_file(self, repo_name, path, branch=None, local_path=None):
        """Download a file from a repo by path and branch. Defaults to the repo's default branch if
        branch is not supplied.

        Uses the download_url directly rather than the API because the API only supports contents up
        to 1MB from a repo directly, and the process for downloading larger files through the API is
        much more involved.

        Because download_url does not go through the API, it does not support username / password
        authentication, and requires a token to authenticate.

        Args:
            repo_name: str
                Full repo name (account/name)
            path: str
                Path from the repo base directory
            branch: Optional[str]
                Branch to download file from. Defaults to repo default branch
            local_path: Optional[str]
                Local file path to download file to. Will create a temp file if not supplied.

        Returns:
            str
                File path of downloaded file

        Raises:
            ParsonsGitHubError
                If the download responds with any status other than 200.
        """
        if not local_path:
            local_path = files.create_temp_file_for_path(path)

        repo = self.client.get_repo(repo_name)
        if branch is None:
            branch = repo.default_branch

        logger.info(f"Downloading {path} from {repo_name}, branch {branch} to {local_path}")

        # Only token credentials can be forwarded to the raw-content host.
        headers = None
        if self.access_token:
            headers = {
                "Authorization": f"token {self.access_token}",
            }

        res = requests.get(
            f"https://raw.githubusercontent.com/{repo_name}/{branch}/{path}",
            headers=headers,
        )

        if res.status_code == 404:
            # Raised as the PyGithub exception so the class-level ``wrap_github_404`` wrapper
            # reports it the same way as any other missing object.
            raise UnknownObjectException(status=404, data=res.content)
        elif res.status_code != 200:
            raise ParsonsGitHubError(
                f"Error downloading {path} from repo {repo_name}: {res.content}"
            )

        Path(local_path).write_bytes(res.content)
        logger.info(f"Downloaded {path} to {local_path}")

        return local_path

    def download_table(self, repo_name, path, branch=None, local_path=None, delimiter=","):
        """Download a CSV file from a repo by path and branch as a Parsons Table.

        Args:
            repo_name: str
                Full repo name (account/name)
            path: str
                Path from the repo base directory
            branch: Optional[str]
                Branch to download file from. Defaults to repo default branch
            local_path: Optional[str]
                Local file path to download file to. Will create a temp file if not supplied.
            delimiter: Optional[str]
                The CSV delimiter to use to parse the data. Defaults to ','

        Returns:
            Parsons Table
                See :ref:`parsons-table` for output options.
        """
        downloaded_file = self.download_file(repo_name, path, branch, local_path)

        return Table(petl.fromcsv(downloaded_file, delimiter=delimiter))