Monday, 24 June 2013

Browsing the NCBI Taxonomy with Python

The Taxonomy browser is a fantastic tool to browser the Tree of Life:
http://www.ncbi.nlm.nih.gov/taxonomy
http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Root


I wrote a simple script in Python to browse it.

First, download the taxonomy archive in your folder:
ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

Unpack it:
tar xvfz taxdump.tar.gz

It will output the following files:
citations.dmp
delnodes.dmp
division.dmp
gc.prt
gencode.dmp
merged.dmp
names.dmp
nodes.dmp
readme.txt


Only "names.dmp and "nodes.dmp" are important for us.

Download the script from here and save it in the same folder:
parse_taxbrowser.py


Or copy it and save it as "parse_taxbrowser.py" in the same folder:

#!/usr/bin/python

import os
import sys

# Definition of the classe Node
class Node:
    """Noeud"""
    def __init__(self):
        self.tax_id = 0       # Number of the tax id.
        self.parent = 0       # Number of the parent of this node
        self.children = []    # List of the children of this node
        self.tip = 0          # Tip=1 if it's a terminal node, 0 if not.
        self.name = ""        # Name of the node: taxa if it's a terminal node, numero if not.      
    def genealogy(self):      # Trace genealogy from root to leaf
        ancestors = []        # Initialise the list of all nodes from root to leaf.
        tax_id = self.tax_id  # Define leaf
        while 1:
            if name_object.has_key(tax_id):
                ancestors.append(tax_id)
                tax_id = name_object[tax_id].parent
            else:
                break
            if tax_id == "1":
                # If it is the root, we reached the end.
                # Add it to the list and break the loop
                ancestors.append(tax_id)
                break
        return ancestors # Return the list

# Function to find common ancestor between two nodes or more
def common_ancestor(node_list):
    global name_object
    list1 = name_object[node_list[0]].genealogy()  # Define the whole genealogy of the first node
    for node in node_list:
        list2 = name_object[node].genealogy()      # Define the whole genealogy of the second node
        ancestral_list = []                            
        for i in list1:
            if i in list2:                         # Identify common nodes between the two genealogy
                ancestral_list.append(i)                
        list1 = ancestral_list                     # Reassing ancestral_list to list 1.
    common_ancestor = ancestral_list[0]            # Finally, the first node of the ancestra_list is the common ancestor of all nodes.
    return common_ancestor                         # Return a node
def all_descendant(self): # Find all children from a node
        terminal_nodes  = []
        list_descendant = []
        list_descendant.append(self.tax_id)
        for i in list_descendant:
            #print i
            if name_object[i].tip == 1:
                terminal_nodes.append(i)
            else:
                for j in name_object[i].children:
                    list_descendant.append(j)
        return terminal_nodes # Return a list


#############################
#                           #
#   Read taxonomy files     #
#                           #
#############################

######################
#
# Load names defintion

name_dict = {}          # Initialise dictionary with TAX_ID:NAME
name_dict_reverse = {}  # Initialise dictionary with NAME:TAX_ID

# Load  NCBI names file ("names.dmp")
name_file =  open("names.dmp","r")
while 1:
    line = name_file.readline()
    if line == "":
        break
    line = line.rstrip()
    line = line.replace("\t","")
    tab = line.split("|")
    if tab[3] == "scientific name":
        tax_id, name = tab[0], tab[1]     # Assign tax_id and name ...
        name_dict[tax_id] = name          # ... and load them
        name_dict_reverse[name] = tax_id  # ... into dictionaries
name_file.close()


######################
#
# Load taxonomy

# Define taxonomy variable
global name_object
name_object = {}


# Load taxonomy NCBI file ("nodes.dmp")
taxonomy_file = open("nodes.dmp","r")
while 1:
    line = taxonomy_file.readline()
    if line == "":
        break
    #print line
    line = line.replace("\t","")
    tab = line.split("|")
   
    tax_id = str(tab[0])
    tax_id_parent = str(tab[1])
    division = str(tab[4])

    # Define name of the taxid
    name = "unknown"
    if tax_id in name_dict:
        name = name_dict[tax_id]
   
    if not name_object.has_key(tax_id):
        name_object[tax_id] = Node()
    name_object[tax_id].tax_id   = tax_id        # Assign tax_id
    name_object[tax_id].parent   = tax_id_parent # Assign tax_id parent
    name_object[tax_id].name     = name          # Assign name
   
    if  tax_id_parent in name_object:
        children = name_object[tax_id_parent].children  # If parent is is already in the object
        children.append(tax_id)                  # ...we found its children.
        name_object[tax_id_parent].children = children  # ... so add them to the parent
taxonomy_file.close()



#################################################
#                                               #
#    Exemple 1 : Evolutionary history of human  #
#                                               #
#################################################

# But what is the tax_id of Human ???
tax_id_human = name_dict_reverse["Homo sapiens"]

print "Tax_id of Human (Homo sapiens) is: ", tax_id_human

# Ok, now define human genealogy...
human_genealogy = name_object[tax_id_human].genealogy()

#... and display it, with tax_id and name
for tax_id in human_genealogy:
    print  "Name: ", name_object[tax_id].name, " Tax_id: ",tax_id


##################################################################
#                                                                #
#    Exemple 2 : Common ancestor  between Trichoplax and human   #
#                                                                #
##################################################################

# What is the common ancestor between Trichoplax and Human?

# Trichoplax: http://en.wikipedia.org/wiki/Trichoplax

# Define the two nodes and add them to a list
tax_id_1 = name_dict_reverse["Trichoplax adhaerens"]
tax_id_2 = name_dict_reverse["Homo sapiens"]
list_of_nodes = [tax_id_1, tax_id_2]


# Identify the common ancestor
common_ancestor = common_ancestor(list_of_nodes)

print "The common ancestor between ",name_object[tax_id_1].name, " and ",name_object[tax_id_2].name, " is: ", name_object[common_ancestor].name


Then makes it executable:
chmod +x parse_taxbrowser.py

Then execute it:
./parse_taxbrowser.py

And enjoy!

It will do four things:
1) Load the assocation between Tax_id and Names ("names.dmp")
2) Load the Tree of Life ("nodes.dmp")
3) Display the genealogy from root to Human
4) Find the common ancestor between Trichoplax and Human.





6 comments:

  1. Thank you Romain for sharing, is it possible to stop by descendent level, like fetching all first level descedent taxids by a parent taxid ?

    ReplyDelete
  2. Hi Walid,

    - If you want the direct descendant of node, you can directly ask with:
    name_object[tax_id].children

    - If you want all terminal leaves of a node, you can use this function I add in the main post:
    name_object[tax_id].all_descendant()


    Cheers,
    Romain

    ReplyDelete
  3. Romain, thank you for sharing a code. Is there any easy way to retrieve the rank name for the parents of the given node? Let's say I want a "phylum" rank for a species with taxid 444103.

    ReplyDelete
    Replies
    1. Hi,

      I am not exactly sure if this is what you want, but you can try to modify the script by :

      1) In the class Node, add the line:
      self.rank = "" # Rank of the node

      2) In the parsing of the files "node.dmp", modify as follow:
      tax_id = str(tab[0])
      tax_id_parent = str(tab[1])
      rank = str(tab[2])
      division = str(tab[4])

      3) Add this code at the end of the script:

      tax_id = "444103"
      while 1:
      parent = name_object[tax_id].parent
      print("The parent of ",name_object[tax_id].name, "[TaxID:", tax_id, "] is ", name_object[parent].name, "[TaxID:", parent, "], and " \
      " its rank is ", name_object[parent].rank)
      tax_id = parent
      if tax_id == "1":
      break


      Does it work for you?

      Romain

      Delete
    2. This comment has been removed by the author.

      Delete
    3. Sorry, it is a bit annoying to post code in the answer box, as it remove the formatting. You will have to put correct spaces in the while loop.

      Delete