Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: generate balanced New Tab schedule #1075

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ opentelemetry-sdk-extension-aws = "*"
opentelemetry-exporter-otlp-proto-grpc = "*"
opentelemetry-instrumentation-aiohttp-client = "==0.41b0"
opentelemetry-instrumentation-fastapi = "==0.41b0"
pymoo = "*"
numpy = "*"

[requires]
python_version = "3.8"
3,114 changes: 1,766 additions & 1,348 deletions Pipfile.lock

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions app/models/prospect_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pydantic import BaseModel


class ProspectModel(BaseModel):
id: str
topic: str
publisher: str
url: str
quality_score: float
115 changes: 115 additions & 0 deletions app/rankers/new_tab_scheduler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from typing import List, Dict

import numpy as np
from pymoo.algorithms.soo.nonconvex.ga import BGA
from pymoo.core.problem import Problem
from pymoo.operators.crossover.pntx import PointCrossover
from pymoo.optimize import minimize

from app.models.prospect_model import ProspectModel


class ArticleSelectionProblem(Problem):
def __init__(
self,
articles: List[ProspectModel],
topic_duplicate_limits: Dict[str, int],
topic_duplicate_limit_fallback: int,
publisher_duplicate_limit: int
):
n_items = len(articles)
self.P = np.array([a.quality_score for a in articles]) # profit

all_topics = sorted({a.topic for a in articles})
all_publishers = sorted({a.publisher for a in articles})

# Create topic matrix
topic_matrix = np.zeros((n_items, len(all_topics)))
for i, article in enumerate(articles):
topic_index = all_topics.index(article.topic)
topic_matrix[i, topic_index] = 1

# Create publisher matrix
publisher_matrix = np.zeros((n_items, len(all_publishers)))
for i, article in enumerate(articles):
publisher_index = all_publishers.index(article.publisher)
publisher_matrix[i, publisher_index] = 1

# Concatenate topic and publisher matrices
self.W = np.hstack((topic_matrix, publisher_matrix))
n_constraints = self.W.shape[1] # equal to len(all_topics) + len(all_publishers)

# Create a vector for duplicate limits
self.duplicate_limit = np.array(
[topic_duplicate_limits.get(topic, topic_duplicate_limit_fallback) for topic in all_topics] +
[publisher_duplicate_limit] * len(all_publishers)
)

super().__init__(n_var=n_items, n_obj=1, n_ieq_constr=n_constraints, xl=0, xu=1, vtype=bool)

def _evaluate(self, x, out, *args, **kwargs):
"""
@param x: Numpy array with shape (pop_size, n_var). This is a batch of potential solutions to be evaluated.
@param out["F"]: cost function values of shape (pop_size,)
@param out["G"]: constraint value array of shape (pop_size, n_constraints)
"""
out["F"] = -np.sum(self.P * x, axis=1)
out["G"] = np.dot(x, self.W) - self.duplicate_limit


class SchedulingGA(BGA):
def __init__(self):
super().__init__(
pop_size=50,
crossover=PointCrossover(n_points=2),
)


"""
Values below are the 75th percentile topic counts per day, scheduled on NEW_TAB_EN_US between 2023-06-01 and 2023-12-22.
"""
DEFAULT_TOPIC_LIMITS = {
'ENTERTAINMENT': 5,
'FOOD': 5,
'SCIENCE': 4,
'HEALTH_FITNESS': 4,
'SELF_IMPROVEMENT': 4,
'EDUCATION': 3,
'POLITICS': 3,
'TECHNOLOGY': 2,
'PERSONAL_FINANCE': 2,
'BUSINESS': 2,
'CAREER': 2,
'TRAVEL': 2,
'PARENTING': 2,
'SPORTS': 1,
}


def select_articles(
prospects: List[ProspectModel],
topic_duplicate_limits=None,
topic_duplicate_limit_fallback: int = 2,
publisher_duplicate_limit: int = 2,
n_gen: int = 100,
) -> List[ProspectModel]:
"""
@param topic_duplicate_limits: Dictionary of maximum number of items with the same topics to select
@param topic_duplicate_limit_fallback: Fallback maximum number duplicate topics, for topics not present in the dict
@param publisher_duplicate_limit: Maximum number of items to select with the same publisher
@param prospects: List of all available candidate items
@param n_gen: Number of generations
@return: a subset of prospects optimized for quality_score sum, with constraints on duplicate topics and publishers.
"""
if topic_duplicate_limits is None:
topic_duplicate_limits = DEFAULT_TOPIC_LIMITS

problem = ArticleSelectionProblem(
prospects,
topic_duplicate_limits=topic_duplicate_limits,
topic_duplicate_limit_fallback=topic_duplicate_limit_fallback,
publisher_duplicate_limit=publisher_duplicate_limit,
)
algorithm = SchedulingGA()
res = minimize(problem, algorithm, termination=('n_gen', n_gen))
return [article for article, x in zip(prospects, res.X) if float(x) > 0.5]
Loading