Skip to content

Commit

Permalink
Merge pull request #59 from dgraph-io/raphael/social
Browse files Browse the repository at this point in the history
add social data and generator
  • Loading branch information
rderbier authored Sep 25, 2024
2 parents d47fe8c + 46a5fbc commit e551566
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 0 deletions.
133 changes: 133 additions & 0 deletions data/social/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@

Dataset created to support the blog post about variable propagation on 'social' use cases.

https://dgraph.io/blog/post/20240923-variable-propgation/


The RDF file `contacts.rdf.gz` has been generated for 10000 users with the script generate.py.


Start the dgraph container with the following command

> docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-dev dgraph/standalone:latest
Copy the files to the mounted directory so that they are seen in Docker.

> cp contacts.rdf.gz <local path to /dgraph-data>
> cp contacts.schema <local path to /dgraph-data>
Use dgraph live command in the docker instance

> docker exec -it dgraph-dev dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema
You can get some usernames

```graphql
{
user(func:has(username), first:5) {
username
}
}
```

and test the queries from the blog post
## mutual 'follows'
```graphql
{

userA as var(func: eq(username, "barbara10")) {
# use a named variable userA to be able to exclude this node later in the query
c as math(1) # start c =1 on user A Node
follows_of_userA as follows {
# c is propagated, each follow is reached one time so c =1 for every follow
~follows @filter(NOT uid(userA)) {
# ~follows is the reverse relationship
# users at this level are reached by all the common follows,
# c = sum all path = count of common follows
# keep the value in a variable,
# in Dgraph a variable is a map uid -> value, so we have the count for every target
mutual_follows as math(c)
}
}
}

target_user(func: uid(mutual_follows), orderdesc: val(mutual_follows), first:1) {
username
mutual_follows_count: val(mutual_follows)
mutual_follows: follows @filter(uid(follows_of_userA)) {
username
}
}
}
```

## mutual 'contacts'
```graphql
{
var(func: eq(username, "barbara10")) {
c as math(1)
userA_phone_number as ~belongs_to {
userA_contacts as has_in_contacts {
~has_in_contacts @filter(NOT uid(userA_phone_number)) {
belongs_to{
mutual_contacts as Math(c)
}
}
}
}
}


target_user(func: uid(mutual_contacts), orderdesc: val(mutual_contacts), first: 1) {
username
mutual_contact_count:val(mutual_contacts)
phone:~belongs_to {
phone_number
mutual_contacts: has_in_contacts @filter(uid(userA_contacts)) {
phone_number
belongs_to {
username
}
}
}
}
}
```

## computing a complex score

```graphql
{
userA as var(func: eq(username, "barbara10")) {
c as math(1) # start c =1 on user A Node
# first block to compute mutual follows using variable propagation
follows {
~follows @filter(NOT uid(userA)) {
mutual_follows as math(c)
}
}
# second block to compute mutual contacts using same variable !
# different path.
userA_phone_number as ~belongs_to {
has_in_contacts {
~has_in_contacts @filter(NOT uid(userA_phone_number)) {
belongs_to{
mutual_contacts as Math(c)
}
}
}
}
}

# compute a score using the formula
var(func: uid(mutual_follows, mutual_contacts)) {
score as math(0.4 * mutual_follows + 0.6 * mutual_contacts)
}
# get target info
target(func: uid(score), orderdesc: val(score), first: 1) {
username
score: val(score)
count_mutual_follows: val(mutual_follows)
count_mutual_contacts: val (mutual_contacts)
}
}
```
3 changes: 3 additions & 0 deletions data/social/contacts.rdf.gz
Git LFS file not shown
5 changes: 5 additions & 0 deletions data/social/contacts.schema
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
phone_number: string @index(hash) .
username: string @index(hash) .
has_in_contacts: [uid] @reverse .
belongs_to: uid @reverse .
follows: [uid] @reverse .
53 changes: 53 additions & 0 deletions data/social/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/python

from faker import Faker
import pandas as pd
import gzip
import sys
from random import randint


faker = Faker()


def generate_data(size=500):
    """Return a DataFrame of ``size`` fake users.

    The frame has two columns: ``user_name`` and ``phone_number``. Each phone
    number is a Faker MSISDN with its first four characters dropped. Both
    columns contain distinct values.
    """
    # ``faker.unique`` only guarantees that the *full* MSISDN is unique; two
    # different MSISDNs can still share the same suffix once the first four
    # characters are dropped. Deduplicate on the truncated value instead,
    # because that is the value used as the phone node identifier in the RDF.
    phones = []
    seen = set()
    while len(phones) < size:
        phone = faker.msisdn()[4:]
        if phone not in seen:
            seen.add(phone)
            phones.append(phone)
    names = [faker.unique.user_name() for _ in range(size)]
    return pd.DataFrame({'user_name': names, 'phone_number': phones})


def dataframe_to_rdf(data, filehandle=sys.stdout):
    """Write users, phone numbers and their relationships as RDF triples.

    For every row of ``data`` (columns ``user_name`` and ``phone_number``)
    this writes:

    * the ``phone_number`` and ``username`` literals and the ``belongs_to``
      edge from the phone node to the user node;
    * ``follows`` edges from the user to a random sample of other users;
    * ``has_in_contacts`` edges from the phone to a random sample of other
      phones.

    Each sample targets between 5 and 100 rows, capped at the number of rows
    available. The row itself is skipped if it is drawn, so no self-edges are
    written.

    Args:
        data: pandas DataFrame with ``user_name`` and ``phone_number`` columns.
        filehandle: text-mode writable object; defaults to ``sys.stdout``.
    """
    population = len(data)
    for _, row in data.iterrows():
        user = row['user_name']
        phone = row['phone_number']
        # users and phone numbers
        triples = [
            f'<_:{phone}> <phone_number> "{phone}" .',
            f'<_:{user}> <username> "{user}" .',
            f'<_:{phone}> <belongs_to> <_:{user}> .',
        ]
        # follows relationship: DataFrame.sample raises ValueError when asked
        # for more rows than exist, so cap the sample size for small frames.
        followed = data.sample(n=min(randint(5, 100), population))
        triples.extend(
            f'<_:{user}> <follows> <_:{target}> .'
            for target in followed['user_name']
            if target != user
        )
        # contacts relationship, sampled independently of the follows
        contacts = data.sample(n=min(randint(5, 100), population))
        triples.extend(
            f'<_:{phone}> <has_in_contacts> <_:{target}> .'
            for target in contacts['phone_number']
            if target != phone
        )
        # one write per user keeps memory flat for large data sets
        filehandle.write("\n".join(triples) + "\n")


# Build the 10000-user data set and store it as a gzip-compressed RDF file.
data = generate_data(size=10000)
# gzip.open defaults to binary mode; text mode ("wt") is required because
# dataframe_to_rdf writes str, not bytes.
with gzip.open("./contacts.rdf.gz", mode="wt") as rdf_out:
    dataframe_to_rdf(data, rdf_out)


# ## load data set
# Start the dgraph container with the following command
# docker run -it -d -p 8080:8080 -p 9080:9080 -v /path/to/dgraph-data:/dgraph --name dgraph-v24 dgraph/standalone:latest
# cp contacts.rdf.gz <local path to /dgraph-data>
# cp contacts.schema <local path to /dgraph-data>
# docker exec -it dgraph-v24 dgraph live -c 1 -f /dgraph/contacts.rdf.gz -s /dgraph/contacts.schema

0 comments on commit e551566

Please sign in to comment.