Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add contributions by contributor metric and tests #7

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions broomstick/data/es/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,16 @@ def exclude_org(s, org_name):
return s.exclude('term', author_org_name=org_name)


def ignore_bots(s):
    """Excludes the contributions authored by bots from a search.

    NOTE(review): the sibling `exclude_org` builds its exclusion with a
    'term' query, while this one uses 'bool' -- confirm 'bool' is the
    intended query type for the `author_bot` field.

    :param s: the search we want to update.

    :returns: the result of `s.exclude`, i.e. the search with the
        exclusion filter set.
    """
    bot_condition = {'author_bot': True}
    return s.exclude('bool', **bot_condition)


def filter_org(s, org_name):
"""Adds a filter for retrieving only authors affiliated to the given org.

Expand Down Expand Up @@ -274,3 +284,59 @@ def contributions_count_by_org(data_source,
inplace=True)

return contribs_by_org_df


def contributions_count_by_contributor(data_source,
                                       start_date,
                                       end_date=None,
                                       exclude_bots=True):
    """Gets the number of contributions of each contributor.

    :param data_source: `broomstick.core.DataSource`
    :param start_date: date from which we want to start counting
        contributions (exclusive).
    :param end_date: date until which we want to count contributions
        (inclusive). `None` by default, means count everything from
        `start_date`.
    :param exclude_bots: whether or not to exclude contributions sent by
        bots.
    :returns: a Pandas DataFrame with two columns:
        - `contributor`: contributor name.
        - `contributions`: the number of contributions sent by that
          contributor to the specified data source during the given
          dates.
        When the search returns no contributor buckets, the DataFrame
        is empty but still has both columns.
    """

    s = create_search(data_source=data_source,
                      start_date=start_date,
                      end_date=end_date)

    if exclude_bots:
        s = ignore_bots(s=s)

    # One bucket per author name, ordered by the per-author
    # `total_contribs` metric (cardinality of the data source id field).
    s.aggs.bucket('contributors',
                  'terms',
                  field='author_name',
                  order={'total_contribs': 'desc'},
                  size=10000)\
        .metric('total_contribs',
                'cardinality',
                field=DS_ID_FIELD[data_source],
                precision_threshold=40000)
    # Slice to zero hits: only the aggregation buckets are read below.
    s = s[0:0]

    buckets = s.execute().to_dict()['aggregations']['contributors']['buckets']

    # With no buckets `json_normalize` yields a frame without columns and
    # dropping `doc_count` would raise a KeyError, so return early with
    # the documented columns instead.
    if not buckets:
        return pandas.DataFrame(columns=['contributor', 'contributions'])

    contribs_by_contributor_df = pandas.json_normalize(buckets)

    # remove `doc_count` column
    contribs_by_contributor_df = contribs_by_contributor_df.drop(
        columns=['doc_count'])

    return contribs_by_contributor_df.rename(
        columns={
            'key': 'contributor',
            'total_contribs.value': 'contributions'})



23 changes: 23 additions & 0 deletions broomstick/metrics/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,26 @@ def contributions_count_by_org(data_source,
start_date=start_date,
end_date=end_date,
exclude_unknown=exclude_unknown)


def contributions_count_by_contributor(data_source,
                                       start_date,
                                       end_date=None,
                                       exclude_bots=True):
    """Gets the number of contributions of each contributor.

    Thin wrapper that forwards every argument, by keyword, to
    `com.contributions_count_by_contributor`.

    :param data_source: `broomstick.core.DataSource`
    :param start_date: date from which we want to start counting
        contributions (exclusive).
    :param end_date: date until which we want to count contributions
        (inclusive). `None` by default, means count everything from
        `start_date`.
    :param exclude_bots: whether or not to exclude contributions sent by
        bots.
    :returns: a Pandas DataFrame with two columns:
        - Contributor name.
        - The number of contributions sent by that contributor to the
          specified data source during the given dates.
    """

    query_args = {
        'data_source': data_source,
        'start_date': start_date,
        'end_date': end_date,
        'exclude_bots': exclude_bots,
    }

    return com.contributions_count_by_contributor(**query_args)

192 changes: 192 additions & 0 deletions test/test_es_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,21 @@ def test_exclude_org(self):
author_org_name=esc.UNKNOWN_ORG_NAME)
self.assertEqual(result, 'test')

def test_ignore_bots(self):
    """Tests that `ignore_bots` adds the bot exclusion filter."""

    search = Search()
    # Replace `exclude` so both its arguments and the returned value
    # can be checked without building a real query.
    search.exclude = MagicMock(return_value='test')

    filtered = esc.ignore_bots(search)

    search.exclude.assert_called_with('bool', author_bot=True)
    self.assertEqual(filtered, 'test')


def test_filter_org(self):
"""Tests add organization name inclusion filter.
"""
Expand Down Expand Up @@ -587,6 +602,183 @@ def test_contributions_count_by_org(self,

assert_frame_equal(result, expected_df)

@mock.patch('broomstick.data.es.common.create_search')
@mock.patch('broomstick.data.es.common.ignore_bots')
def test_contributions_count_by_contributor(self,
                                            ignore_bots_mock,
                                            create_search_mock):
    """Tests the count of total contributions by contributor.

    The same canned response is used for four scenarios: both dates,
    start date only, and the same two with bot exclusion disabled.
    """
    start_date = '2018-01-01'
    end_date = '2020-01-01'

    # (contributor, doc_count, total_contribs) of each returned bucket
    bucket_rows = [
        ('Anne', 213, 179),
        ('Bob', 130, 125),
        ('Carl', 33, 30),
    ]

    response = {
        'aggregations': {
            'contributors': {
                'doc_count_error_upper_bound': 0,
                'sum_other_doc_count': 0,
                'buckets': [
                    {
                        'key': name,
                        'doc_count': docs,
                        'total_contribs': {
                            'value': total
                        }
                    }
                    for name, docs, total in bucket_rows
                ]
            }
        }
    }

    expected_df = pandas.DataFrame(
        {
            'contributor': [row[0] for row in bucket_rows],
            'contributions': [row[2] for row in bucket_rows]
        },
        columns=['contributor', 'contributions'])

    # Create a mocked Search
    s, _ = self.__create_mocked_search(response, create_search_mock)

    # Mock `ignore_bots` to return our mocked `Search` object
    ignore_bots_mock.return_value = s

    # (data source, keyword arguments of the call, expected id field)
    scenarios = [
        # Start and end dates
        (DataSource.GIT,
         {'start_date': start_date, 'end_date': end_date},
         'hash'),
        # Start date only
        (DataSource.ALL,
         {'start_date': start_date},
         'painless_unique_id'),
        # (Not) ignoring bots and both dates
        (DataSource.GIT,
         {'start_date': start_date,
          'end_date': end_date,
          'exclude_bots': False},
         'hash'),
        # (Not) ignoring bots and start date only
        (DataSource.GIT,
         {'start_date': start_date, 'exclude_bots': False},
         'hash'),
    ]

    for data_source, call_kwargs, id_field in scenarios:
        bots_excluded = call_kwargs.get('exclude_bots', True)

        if not bots_excluded:
            # Forget the calls recorded by earlier scenarios so that
            # `assert_not_called` only sees this one.
            ignore_bots_mock.reset_mock()

        result = esc.contributions_count_by_contributor(
            data_source,
            **call_kwargs)

        create_search_mock.assert_called_with(
            data_source=data_source,
            start_date=start_date,
            end_date=call_kwargs.get('end_date'))

        if bots_excluded:
            ignore_bots_mock.assert_called_with(s=s)
        else:
            ignore_bots_mock.assert_not_called()

        # `bucket` must be checked before `bucket()` is called to reach
        # the chained `metric` mock.
        s.aggs.bucket.assert_called_with(
            'contributors',
            'terms',
            field='author_name',
            order={'total_contribs': 'desc'},
            size=10000)
        s.aggs.bucket().metric.assert_called_with(
            'total_contribs',
            'cardinality',
            field=id_field,
            precision_threshold=40000)

        assert_frame_equal(result, expected_df)

def __create_mocked_search(self, response, create_search_mock):
# Create a mocked Search
s = MagicMock()
Expand Down
Loading