#! /usr/bin/env python """ # ----------------------------------------------------------------------- # encoding: utf-8 # DIAMOnD.py # Joerg Menche, Susan D. Ghiassian # Last Modified: 2014-12-05 # This code runs the DIAMOnD algorithm as described in # # A DIseAse MOdule Detection (DIAMOnD) Algorithm derived from a # systematic analysis of connectivity patterns of disease proteins in # the Human Interactome # # by Susan Dina Ghiassian, Joerg Menche & Albert-Laszlo Barabasi # # # ----------------------------------------------------------------------- """ import time import cPickle import networkx as nx import numpy as np import copy import scipy.stats from collections import defaultdict import csv import sys # ============================================================================= def print_usage(): print ' ' print ' usage: ./DIAMOnD network_file seed_file n alpha(optional) outfile_name (optional)' print ' -----------------------------------------------------------------' print ' network_file : The edgelist must be provided as any delimiter-separated' print ' table. Make sure the delimiter does not exit in gene IDs' print ' and is consistent across the file.' print ' The first two columns of the table will be' print ' interpreted as an interaction gene1 <==> gene2' print ' seed_file : table containing the seed genes (if table contains' print ' more than one column they must be tab-separated;' print ' the first column will be used only)' print ' n : desired number of DIAMOnD genes, 200 is a reasonable' print ' starting point.' print ' alpha : an integer representing weight of the seeds,default' print ' value is set to 1' print ' outfile_name : results will be saved under this file name' print ' by default the outfile_name is set to "first_n_added_nodes_weight_alpha.txt"' print ' ' # ============================================================================= def check_input_style(input_list): try: network_edgelist_file = input_list[1] seeds_file = input_list[2] max_number_of_added_nodes = int(input_list[3]) # if no input is given, print out a usage message and exit except: print_usage() sys.exit(0) return alpha = 1 outfile_name = 'first_%d_added_nodes_weight_%d.txt'%(max_number_of_added_nodes,alpha) if len(input_list)==5: try: alpha = int(input_list[4]) outfile_name = 'first_%d_added_weight_%d.txt'%(max_number_of_added_nodes,alpha) except: outfile_name = input_list[4] if len(input_list)==6: try: alpha = int(input_list[4]) outfile_name = input_list[5] except: print_usage() sys.exit(0) return return network_edgelist_file,seeds_file,max_number_of_added_nodes,alpha,outfile_name # ============================================================================= def read_input(network_file,seed_file): """ Reads the network and the list of seed genes from external files. * The edgelist must be provided as a tab-separated table. The first two columns of the table will be interpreted as an interaction gene1 <==> gene2 * The seed genes mus be provided as a table. If the table has more than one column, they must be tab-separated. The first column will be used only. * Lines that start with '#' will be ignored in both cases """ sniffer = csv.Sniffer() line_delimiter = None for line in open(network_file,'r'): if line[0]=='#': continue else: dialect = sniffer.sniff(line) line_delimiter = dialect.delimiter break if line_delimiter == None: print 'network_file format not correct' sys.exit(0) # read the network: G = nx.Graph() for line in open(network_file,'r'): # lines starting with '#' will be ignored if line[0]=='#': continue # The first two columns in the line will be interpreted as an # interaction gene1 <=> gene2 #line_data = line.strip().split('\t') line_data = line.strip().split(line_delimiter) node1 = line_data[0] node2 = line_data[1] G.add_edge(node1,node2) # read the seed genes: seed_genes = set() for line in open(seed_file,'r'): # lines starting with '#' will be ignored if line[0]=='#': continue # the first column in the line will be interpreted as a seed # gene: line_data = line.strip().split('\t') seed_gene = line_data[0] seed_genes.add(seed_gene) return G,seed_genes # ================================================================================ def compute_all_gamma_ln(N): """ precomputes all logarithmic gammas """ gamma_ln = {} for i in range(1,N+1): gamma_ln[i] = scipy.special.gammaln(i) return gamma_ln # ============================================================================= def logchoose(n, k, gamma_ln): if n-k+1 <= 0: return scipy.infty lgn1 = gamma_ln[n+1] lgk1 = gamma_ln[k+1] lgnk1 = gamma_ln[n-k+1] return lgn1 - [lgnk1 + lgk1] # ============================================================================= def gauss_hypergeom(x, r, b, n, gamma_ln): return np.exp(logchoose(r, x, gamma_ln) + logchoose(b, n-x, gamma_ln) - logchoose(r+b, n, gamma_ln)) # ============================================================================= def pvalue(kb, k, N, s, gamma_ln): """ ------------------------------------------------------------------- Computes the p-value for a node that has kb out of k links to seeds, given that there's a total of s sees in a network of N nodes. p-val = \sum_{n=kb}^{k} HypergemetricPDF(n,k,N,s) ------------------------------------------------------------------- """ p = 0.0 for n in range(kb,k+1): if n > s: break prob = gauss_hypergeom(n, s, N-s, k, gamma_ln) # print prob p += prob if p > 1: return 1 else: return p # ============================================================================= def get_neighbors_and_degrees(G): neighbors,all_degrees = {},{} for node in G.nodes(): nn = set(G.neighbors(node)) neighbors[node] = nn all_degrees[node] = G.degree(node) return neighbors,all_degrees # ============================================================================= # Reduce number of calculations # ============================================================================= def reduce_not_in_cluster_nodes(all_degrees,neighbors,G,not_in_cluster,cluster_nodes,alpha): reduced_not_in_cluster = {} kb2k = defaultdict(dict) for node in not_in_cluster: k = all_degrees[node] kb = 0 # Going through all neighbors and counting the number of module neighbors for neighbor in neighbors[node]: if neighbor in cluster_nodes: kb += 1 #adding wights to the the edges connected to seeds k += (alpha-1)*kb kb += (alpha-1)*kb kb2k[kb][k] =node # Going to choose the node with largest kb, given k k2kb = defaultdict(dict) for kb,k2node in kb2k.iteritems(): min_k = min(k2node.keys()) node = k2node[min_k] k2kb[min_k][kb] = node for k,kb2node in k2kb.iteritems(): max_kb = max(kb2node.keys()) node = kb2node[max_kb] reduced_not_in_cluster[node] =(max_kb,k) return reduced_not_in_cluster #====================================================================================== # C O R E A L G O R I T H M #====================================================================================== def diamond_iteration_of_first_X_nodes(G,S,X,alpha): """ Parameters: ---------- - G: graph - S: seeds - X: the number of iterations, i.e only the first X gened will be pulled in - alpha: seeds weight Returns: -------- - added_nodes: ordered list of nodes in the order by which they are agglomerated. Each entry has 4 info: * name : dito * k : degree of the node * kb : number of +1 neighbors * p : p-value at agglomeration """ N = G.number_of_nodes() added_nodes = [] # ------------------------------------------------------------------ # Setting up dictionaries with all neighbor lists # and all degrees # ------------------------------------------------------------------ neighbors,all_degrees = get_neighbors_and_degrees(G) # ------------------------------------------------------------------ # Setting up initial set of nodes in cluster # ------------------------------------------------------------------ cluster_nodes = set(S) not_in_cluster = set() s0 = len(cluster_nodes) s0 += (alpha-1)*s0 N +=(alpha-1)*s0 # ------------------------------------------------------------------ # precompute the logarithmic gamma functions # ------------------------------------------------------------------ gamma_ln = compute_all_gamma_ln(N+1) # ------------------------------------------------------------------ # Setting initial set of nodes not in cluster # ------------------------------------------------------------------ for node in cluster_nodes: not_in_cluster |= neighbors[node] not_in_cluster -= cluster_nodes # ------------------------------------------------------------------ # # M A I N L O O P # # ------------------------------------------------------------------ all_p = {} while len(added_nodes) < X: # ------------------------------------------------------------------ # # Going through all nodes that are not in the cluster yet and # record k, kb and p # # ------------------------------------------------------------------ info = {} pmin = 10 next_node = 'nix' reduced_not_in_cluster = reduce_not_in_cluster_nodes(all_degrees, neighbors,G, not_in_cluster, cluster_nodes,alpha) for node,kbk in reduced_not_in_cluster.iteritems(): # Getting the p-value of this kb,k # combination and save it in all_p, so computing it only once! kb,k = kbk try: p = all_p[(k,kb,s0)] except KeyError: p = pvalue(kb, k, N, s0, gamma_ln) all_p[(k,kb,s0)] = p # recording the node with smallest p-value if p < pmin: pmin = p next_node = node info[node] = (k,kb,p) # --------------------------------------------------------------------- # Adding node with smallest p-value to the list of aaglomerated nodes # --------------------------------------------------------------------- added_nodes.append((next_node, info[next_node][0], info[next_node][1], info[next_node][2])) # Updating the list of cluster nodes and s0 cluster_nodes.add(next_node) s0 = len(cluster_nodes) not_in_cluster |= ( neighbors[next_node] - cluster_nodes ) not_in_cluster.remove(next_node) return added_nodes # =========================================================================== # # M A I N D I A M O n D A L G O R I T H M # # =========================================================================== def DIAMOnD(G_original,seed_genes,max_number_of_added_nodes,alpha,outfile = None): """ Runs the DIAMOnD algorithm Input: ------ - G_original : The network - seed_genes : a set of seed genes - max_number_of_added_nodes: after how many added nodes should the algorithm stop - alpha: given weight to the sees - outfile: filename for the output generates by the algorithm, if not given the program will name it 'first_x_added_nodes.txt' Returns: -------- - added_nodes: A list with 4 entries at each element: * name : name of the node * k : degree of the node * kb : number of neighbors that are part of the module (at agglomeration) * p : connectivity p-value at agglomeration - """ # 1. throwing away the seed genes that are not in the network all_genes_in_network = set(G_original.nodes()) seed_genes = set(seed_genes) disease_genes = seed_genes & all_genes_in_network if len(disease_genes) != len(seed_genes): print "DIAMOnD(): ignoring %s of %s seed genes that are not in the network" %( len(seed_genes - all_genes_in_network), len(seed_genes)) # 2. agglomeration algorithm. added_nodes = diamond_iteration_of_first_X_nodes(G_original, disease_genes, max_number_of_added_nodes,alpha) # 3. saving the results with open(outfile,'w') as fout: print>>fout,'\t'.join(['#rank','DIAMOnD_node']) rank = 0 for DIAMOnD_node_info in added_nodes: rank += 1 DIAMOnD_node = DIAMOnD_node_info[0] p = float(DIAMOnD_node_info[3]) print>>fout,'\t'.join(map(str,([rank,DIAMOnD_node]))) return added_nodes # =========================================================================== # # "Hey Ho, Let's go!" -- The Ramones (1976) # # =========================================================================== if __name__ == '__main__': # ----------------------------------------------------- # Checking for input from the command line: # ----------------------------------------------------- # # [1] file providing the network in the form of an edgelist # (tab-separated table, columns 1 & 2 will be used) # # [2] file with the seed genes (if table contains more than one # column they must be tab-separated; the first column will be # used only) # # [3] number of desired iterations # # [4] (optional) seeds weight (integer), default value is 1 # [5] (optional) name for the results file #check if input style is correct input_list = sys.argv network_edgelist_file,seeds_file,max_number_of_added_nodes,alpha,outfile_name= check_input_style(input_list) # read the network and the seed genes: G_original,seed_genes = read_input(network_edgelist_file,seeds_file) # run DIAMOnD added_nodes = DIAMOnD(G_original, seed_genes, max_number_of_added_nodes,alpha, outfile=outfile_name) print "\n results have been saved to '%s' \n" %outfile_name