Skip to content

Commit

Permalink
Update _pbg.py
Browse files (browse the repository at this point in the history)
  • Loading branch information
JunxiFeng committed Oct 16, 2024
1 parent 25d2100 commit 3401347
Showing 1 changed file with 239 additions and 10 deletions.
249 changes: 239 additions & 10 deletions simba/tools/_pbg.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,15 +34,19 @@ def gen_graph(list_CP=None,
list_PM=None,
list_PK=None,
list_PV=None,
list_VI=None,
list_CG=None,
list_CI=None,
list_CC=None,
list_PG=None,
list_adata=None,
prefix_C='C',
prefix_P='P',
prefix_M='M',
prefix_K='K',
prefix_V='V',
prefix_G='G',
prefix_I='I',
prefix='E',
layer='simba',
copy=False,
Expand Down Expand Up @@ -161,23 +165,30 @@ def gen_graph(list_CP=None,
list_PM,
list_PK,
list_PV,
list_VI,
list_CG,
list_CI,
list_CC,
list_adata]))) == 7:
list_PG,
list_adata]))) == 10:
return 'No graph is generated'
if get_marker_significance:
gen_graph(list_CP=list_CP,
list_PM=list_PM,
list_PK=list_PK,
list_PV=list_PV,
list_VI=list_VI,
list_PG=list_PG,
list_CG=list_CG,
list_CI=list_CI,
list_CC=list_CC,
prefix_C=prefix_C,
prefix_P=prefix_P,
prefix_M=prefix_M,
prefix_K=prefix_K,
prefix_V=prefix_V,
prefix_G=prefix_G,
prefix_I=prefix_I,
layer=layer,
copy=copy,
dirname=dirname,
Expand Down Expand Up @@ -360,7 +371,8 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
ids_kmers = pd.Index([])
ids_motifs = pd.Index([])
ids_variants = pd.Index([])

ids_individuals = pd.Index([])

if list_CP is not None:
for adata_ori in list_CP:
if use_top_pcs_CP is None:
Expand Down Expand Up @@ -392,6 +404,7 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
if get_marker_significance:
n_npeaks = min(int(len(ids_peaks)*fold_null_nodes), len(ids_peaks))
ids_npeaks = pd.Index([f'n{prefix_P}.{x}' for x in range(n_npeaks)])

if list_PM is not None:
for adata_ori in list_PM:
if use_top_pcs_PM is None:
Expand Down Expand Up @@ -437,8 +450,14 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
if get_marker_significance:
n_nvariants = int(len(ids_variants)*fold_null_nodes)
ids_nvariants = pd.Index([f'n{prefix_V}.{x}' for x in range(n_nvariants)])

if list_PG is not None:
for adata_ori in list_PG:
adata = adata_ori.copy()
ids_peaks = ids_peaks.union(adata.obs.index)
ids_genes = ids_genes.union(adata.var.index)
if list_CG is not None:
for adata_ori in list_CG:
for adata_ori in tqdm(list_CG,desc="Processing CG"):
if use_highly_variable:
adata = adata_ori[
:, adata_ori.var['highly_variable']].copy()
Expand All @@ -465,6 +484,34 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
if get_marker_significance:
n_ngenes = int(len(ids_genes)*fold_null_nodes)
ids_ngenes = pd.Index([f'n{prefix_G}.{x}' for x in range(n_ngenes)])

if list_CI is not None:
for adata_ori in tqdm(list_CI,desc="Processing CI"):
adata = adata_ori.copy()
ids_cells_i = adata.obs.index
if len(dict_cells) == 0:
dict_cells[prefix_C] = ids_cells_i
else:
# check if cell indices are included in dict_cells
flag_included = False
for k in dict_cells.keys():
ids_cells_k = dict_cells[k]
if set(ids_cells_i) <= set(ids_cells_k):
flag_included = True
break
if not flag_included:
# create a new set of entities
# when not all indices are included
dict_cells[
f'{prefix_C}{len(dict_cells)+1}'] = \
ids_cells_i
ids_individuals = ids_individuals.union(adata.var.index)

if list_VI is not None:
for adata_ori in tqdm(list_VI,desc="Processing VI"):
adata = adata_ori.copy()
ids_variants = ids_variants.union(adata.obs.index)
ids_individuals = ids_individuals.union(adata.var.index)

entity_alias = pd.DataFrame(columns=['alias'])
dict_df_cells = dict() # unique cell dataframes
Expand Down Expand Up @@ -494,6 +541,16 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
settings.pbg_params['entities'][f'n{prefix_G}'] = {'num_partitions': 1}
entity_alias = pd.concat([entity_alias, df_ngenes],
ignore_index=False)
if len(ids_individuals) > 0:
df_individuals = pd.DataFrame(
index=ids_individuals,
columns=['alias'],
data=[f'{prefix_I}.{x}' for x in range(len(ids_individuals))])
settings.pbg_params['entities'][prefix_I] = {'num_partitions': 1}
entity_alias = pd.concat(
[entity_alias, df_individuals],
ignore_index=False)

if len(ids_peaks) > 0:
df_peaks = pd.DataFrame(
index=ids_peaks,
Expand Down Expand Up @@ -662,6 +719,61 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
df_edges = pd.concat(
[df_edges, df_edges_x],
ignore_index=True)

if list_CI is not None:
for i, adata_ori in enumerate(list_CI):
adata = adata_ori.copy()
# select reference of cells
for key, df_cells in dict_df_cells.items():
if set(adata.obs_names) <= set(df_cells.index):
break
if layer is not None:
if layer in adata.layers.keys():
arr_simba = adata.layers[layer]
else:
print(f'`{layer}` does not exist in anndata {i} '
'in `list_CI`.`.X` is being used instead.')
arr_simba = adata.X
else:
arr_simba = adata.X
_row, _col = arr_simba.nonzero()
df_edges_x = pd.DataFrame(columns=col_names)

df_edges_x['source'] = df_cells.loc[
adata.obs_names[_row], 'alias'].values
df_edges_x['relation'] = f'r{id_r}'
df_edges_x['destination'] = df_individuals.loc[
adata.var_names[_col], 'alias'].values
if add_edge_weights:
df_edges_x['weight'] = \
arr_simba[_row, _col].A.flatten()
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'{key}',
'rhs': f'{prefix_I}',
'operator': 'none',
'weight': 1.0
})
dict_graph_stats[f'relation{id_r}'] = {
'source': key,
'destination': prefix_I,
'n_edges': df_edges_x.shape[0]}
print(
f'relation{id_r}: '
f'source: {key}, '
f'destination: {prefix_I}\n'
f'#edges: {df_edges_x.shape[0]}')
id_r += 1
df_edges = pd.concat(
[df_edges, df_edges_x],
ignore_index=True)
adata_ori.obs['pbg_id'] = ""
adata_ori.var['pbg_id'] = ""
adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
df_cells.loc[adata.obs_names, 'alias'].copy()
adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
df_individuals.loc[adata.var_names, 'alias'].copy()


if list_PM is not None:
for i, adata_ori in enumerate(list_PM):
Expand Down Expand Up @@ -929,24 +1041,141 @@ def _get_df_edges(adj_mat, df_source, df_dest, adata, relation_id, include_weigh
null_matrix.transpose()[_col, _row].A.flatten()
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'n{prefix_V}',
'rhs': f'{prefix_P}',
'operator': 'fix',
'lhs': f'n{prefix_P}',
'rhs': f'{prefix_V}',
'operator': 'none',
'weight': 1.0,
})
print(
f'relation{id_r}: '
f'source: n{prefix_V}, '
f'destination: {prefix_P}\n'
f'source: n{prefix_P}, '
f'destination: {prefix_V}\n'
f'#edges: {df_edges_x.shape[0]}')
dict_graph_stats[f'relation{id_r}'] = {
'source': f'n{prefix_V}',
'destination': prefix_P,
'source': f'n{prefix_P}',
'destination': prefix_V,
'n_edges': df_edges_x.shape[0]}
id_r += 1
df_edges = pd.concat(
[df_edges, df_edges_x],
ignore_index=True)

if list_VI is not None:
for i, adata_ori in enumerate(list_VI):
adata = adata_ori.copy()
if layer is not None:
if layer in adata.layers.keys():
arr_simba = adata.layers[layer]
else:
print(f'`{layer}` does not exist in anndata {i} '
'in `list_VI`.`.X` is being used instead.')
arr_simba = adata.X
else:
arr_simba = adata.X
_row, _col = arr_simba.nonzero()
df_edges_x = pd.DataFrame(columns=col_names)
df_edges_x['source'] = df_variants.loc[
adata.obs_names[_row], 'alias'].values
df_edges_x['relation'] = f'r{id_r}'
df_edges_x['destination'] = df_individuals.loc[
adata.var_names[_col], 'alias'].values
if add_edge_weights:
df_edges_x['weight'] = \
arr_simba[_row, _col].A.flatten()
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'{prefix_V}',
'rhs': f'{prefix_I}',
'operator': 'none',
'weight': 0.2
})
else:
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'{prefix_V}',
'rhs': f'{prefix_I}',
'operator': 'none',
'weight': 0.2
})
dict_graph_stats[f'relation{id_r}'] = {
'source': prefix_V,
'destination': prefix_I,
'n_edges': df_edges_x.shape[0]}
print(
f'relation{id_r}: '
f'source: {prefix_V}, '
f'destination: {prefix_I}\n'
f'#edges: {df_edges_x.shape[0]}')

id_r += 1
df_edges = pd.concat(
[df_edges, df_edges_x],
ignore_index=True)
adata_ori.obs['pbg_id'] = ""
adata_ori.var['pbg_id'] = ""
adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
df_variants.loc[adata.obs_names, 'alias'].copy()
adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
df_individuals.loc[adata.var_names, 'alias'].copy()

if list_PG is not None:
for i, adata_ori in enumerate(list_PG):
if use_top_pcs:
adata = adata_ori.copy()
if layer is not None:
if layer in adata.layers.keys():
arr_simba = adata.layers[layer]
else:
print(f'`{layer}` does not exist in anndata {i} '
'in `list_PG`.`.X` is being used instead.')
arr_simba = adata.X
else:
arr_simba = adata.X
_row, _col = arr_simba.nonzero()
df_edges_x = pd.DataFrame(columns=col_names)
df_edges_x['destination'] = df_genes.loc[
adata.var_names[_col], 'alias'].values
df_edges_x['relation'] = f'r{id_r}'
df_edges_x['source'] = df_peaks.loc[
adata.obs_names[_row], 'alias'].values
if add_edge_weights:
df_edges_x['weight'] = \
arr_simba[_row, _col].A.flatten()
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'{prefix_P}',
'rhs': f'{prefix_G}',
'operator': 'none',
'weight': 0.2
})
else:
settings.pbg_params['relations'].append({
'name': f'r{id_r}',
'lhs': f'{prefix_P}',
'rhs': f'{prefix_G}',
'operator': 'none',
'weight': 0.2
})
dict_graph_stats[f'relation{id_r}'] = {
'source': prefix_P,
'destination': prefix_G,
'n_edges': df_edges_x.shape[0]}
print(
f'relation{id_r}: '
f'source: {prefix_P}, '
f'destination: {prefix_G}\n'
f'#edges: {df_edges_x.shape[0]}')

id_r += 1
df_edges = pd.concat(
[df_edges, df_edges_x],
ignore_index=True)
adata_ori.obs['pbg_id'] = ""
adata_ori.var['pbg_id'] = ""
adata_ori.obs.loc[adata.obs_names, 'pbg_id'] = \
df_peaks.loc[adata.obs_names, 'alias'].copy()
adata_ori.var.loc[adata.var_names, 'pbg_id'] = \
df_genes.loc[adata.var_names, 'alias'].copy()

if list_CG is not None:
for i, adata_ori in enumerate(list_CG):
Expand Down

0 comments on commit 3401347

Please sign in to comment.