annotate_neurosynth.py
"""
Due to bandwidth limitations, the web interface (https://neurosynth.org/)
is not intended to support mass downloading of hundreds or thousands of
images, and attempts to scrape content in an automated way will result
in permanent IP bans.
Source: check Data from https://neurosynth.org/code/
Therefore we download manually each html showing the max number (100) of
title under the tag 'studies' for each topic inside
https://neurosynth.org/analyses/topics/v5-topics-200/ for example.
Donwload all the files to out_dir/v5topic200
Download the root pages of the topics
(https://neurosynth.org/analyses/topics/v5-topics-200/) as:
- Neurosynth_v5topic200_00_topics.htm
- Neurosynth_v5topic200_01_topics.htm
- ...
Download the pages of each topic
(https://neurosynth.org/analyses/topics/v5-topics-200/0) as:
- Neurosynth_v5topic200_topic000_00.htm
- Neurosynth_v5topic200_topic000_01.htm
- ...
"""
import glob
import os

import nimare
import numpy as np
from bs4 import BeautifulSoup
from neurosynth.base.dataset import download

# Download Neurosynth
out_dir = os.path.abspath('/Users/jperaza/Desktop/neurosynth/')
if not os.path.isdir(out_dir):
    os.mkdir(out_dir)

if not os.path.isfile(os.path.join(out_dir, 'database.txt')):
    download(out_dir, unpack=True)
# Convert Neurosynth database files to NiMARE Dataset
dset = nimare.io.convert_neurosynth_to_dataset(
    os.path.join(out_dir, 'database.txt'),
    os.path.join(out_dir, 'features.txt'))
dset.save(os.path.join(out_dir, 'neurosynth_dataset.pkl.gz'))
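# Note on ID format: NiMARE builds study IDs as '<study ID>-<contrast ID>',
# and the Neurosynth database contains a single contrast per study, so the
# keys built below use a hardcoded contrast ID of '1' (e.g., a hypothetical
# PubMed ID 12345678 would become '12345678-1').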
TOPICS = 'v5topic200'

# Find the number of studies per topic from the topics root pages, to
# perform a final check after annotation
topics_pages = glob.glob(os.path.join(
    out_dir, TOPICS, 'Neurosynth_v5topic200_*_topics.htm'))
topics_pages.sort()

n_studies = []
for topics_page in topics_pages:
    with open(topics_page) as html_topics:
        soup_topics = BeautifulSoup(html_topics, 'lxml')
        studies_table = soup_topics.find('div', class_='row').find(
            'div', class_='col-md-12 content').find_all('td')
        # Every third cell (offset 2) holds the study count for a topic
        for idx in range(len(studies_table) // 3):
            n_studies.append(int(studies_table[idx * 3 + 2].text))

for topic in range(200):
    files = glob.glob(os.path.join(
        out_dir, TOPICS, 'Neurosynth_v5topic200_topic{:03d}_*.htm'.format(topic)))
    files.sort()
    # Map NiMARE study IDs to the topic weights parsed from the topic pages
    nimare_ids_weight = {}
    for file in files:
        with open(file) as html_topic:
            soup_topic = BeautifulSoup(html_topic, 'lxml')
            studies_table = soup_topic.find('div', class_='row').find(
                'div', class_='row').find('div', class_='col-md-10 content').find(
                'div', class_='tab-content').find('div', id='studies').table.tbody
            studies = studies_table.find_all('a')
            weights = studies_table.find_all('td', class_='sorting_1')
            for i, study in enumerate(studies):
                # Neurosynth has a single contrast ('1') per study
                expid = '1'
                pid = study['href'].split('/')[4]
                nimare_ids_weight['{0}-{1}'.format(pid, expid)] = weights[i].text
    # Build a column of topic weights aligned with dset.ids
    found_ids = np.isin(dset.ids, list(nimare_ids_weight.keys()))
    ids_column = np.zeros(len(dset.ids))
    topic_weights = [float(nimare_ids_weight[sid]) for sid in dset.ids[found_ids]]
    ids_column[found_ids] = topic_weights
    # Double-check that all the studies are in the dataset and match the number
    # of studies reported on https://neurosynth.org/analyses/topics/v5-topics-200/
    nonzero = ids_column[ids_column != 0]
    if len(nonzero) != n_studies[topic]:
        print('Only {} out of {} studies found in topic {} from {}'.format(
            len(nonzero), n_studies[topic], topic, TOPICS))
        print('Check the local HTML files')
    # Add the annotation column to the Dataset
    dset.annotations['Neurosynth_{}__topic{:03d}'.format(
        TOPICS, topic)] = ids_column

# Save the annotated Dataset to file
dset.save(os.path.join(out_dir, 'neurosynth_dataset_annotation_test.pkl.gz'))
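# Quick inspection of the result (a minimal sketch; it assumes dset.annotations
# is a pandas DataFrame, as in recent NiMARE versions, so pandas' filter() can
# select the newly added topic columns by name):
print(dset.annotations.filter(like='Neurosynth_{}'.format(TOPICS)).head())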