{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Blast and GOSlim annotation of *Porites astreoides* transcriptome " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow details the annotation of a *Porites astreoides* [transcriptome](https://dl.dropboxusercontent.com/u/37523721/pastreoides_transcriptome_july2014.zip)\n", "\n", "The notebook requires you have the following \n", "- [NCBI Blast: 2.2.3](ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/)\n", "- [SQLShare](https://sqlshare.escience.washington.edu/accounts/login/?next=/sqlshare/%3F__hash__%3D)\n", "\n", "The annotation also requires a Uniprot/Swissprot BLAST database. Instructions for setting up this database can be found [here](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/README.md)\n", "\n", "The orginal analysis was carried out on on Mac OS X v10.10.3 running Python: 2.7.9 and IPython: 3.1.0.\n", "\n", "This workflow is structured so that anyone can reproduce the analysis by downloading the repository locally and executing." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Past\n" ] } ], "source": [ "cd ../data/Past" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 13.0M 100 13.0M 0 0 2373k 0 0:00:05 0:00:05 --:--:-- 3551k\n" ] } ], "source": [ "#Obtain FASTA file\n", "!curl -O https://dl.dropboxusercontent.com/u/37523721/pastreoides_transcriptome_july2014.zip" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: pastreoides_transcriptome_july2014.zip\n", " inflating: addGO_manually .R \n", " creating: __MACOSX/\n", " inflating: __MACOSX/._addGO_manually .R \n", " inflating: past.fasta \n", " inflating: past_addGenes.tab \n", " inflating: past_CDS.fas \n", " inflating: past_defog_iso2go.tab \n", " inflating: __MACOSX/._past_defog_iso2go.tab \n", " inflating: past_defog_iso2kogClass.tab \n", " inflating: past_iso2gene.tab \n", " inflating: past_iso2go.tab \n", " inflating: past_iso2kegg.tab \n", " inflating: past_iso2kogClass.tab \n", " inflating: past_notes.txt \n", " inflating: __MACOSX/._past_notes.txt \n", " inflating: past_PRO.fas \n", " inflating: past_seq2iso.tab \n" ] } ], "source": [ "!unzip -o pastreoides_transcriptome_july2014.zip" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">GCKDGN101CF7JK gene=GCKDGN101CF7JK\r\n", 
"GCAGTTCACATGACCCGAGGCTAGGAGATCCCATGAGATAGCACAGTGCAAGCGTGACTCATTGCACATGTGCTTGCAACGCGACTGGCGTCTTCTCGACGAGCTCCTGAGTATTTTCACAAGATTGGCCCCTATTTCTCAATGCCGTTGGGAAAATTATTGGGCCGTAGAAAACAGAATATCACACTAATGTAGTTGGTAGGTTAGCAAAACCAGTACCAGC\r\n", ">GCKDGN101CAZ1A gene=GCKDGN101CAZ1A Gene=\"Defense-related protein containing SCP domain ; pathogenesis-related protein 1 E=1e-13\"\r\n", "GCAGTTGACTGTCATTTCCCAGGGGACGATACAGTGCCACCACAATGACGTACTTACCGCTGGGGCTTGGCGCTGCCCCCATGCCTAGCTGATTGGTGTCTCTCCATATCATCTGTGTGAAGTGCTTTACTTTGTCATTGAGAACTGGATAGGAGAAGCTGTACGCTTTCTTCTCGCCGTACCATATAGTCGTGGCCTTGAGAGGTGCTCTCTGCAGATCGTGCCATATCTGTGCAATGTTTTCACCCGCTGGCTCGCCTTGGAAAGTGGACGGGTCACTGGCTACAGACTCTGCTATATTCTGCGCTTGTTCTGCTAGAGTGGCATTCCATTGTAAAGGGNGGACGTGATGAAGTGATCTGA\r\n", ">GCKDGN101ANI4S gene=GCKDGN101ANI4S Gene=\"Pterin carbinolamine dehydratase PCBD/dimerization cofactor of HNF1\"\r\n", "TGTCCTGTCTTTCTTCGTTGTTCTTCTTGACCTGCTATACACATTAAACCACTCTGGGTGATGATCCATCTTCTCCGACATGAGAGCCACGCGGCTCATGAAACCAAATGCCTGGTTGAAATTCTTGAACTTAAACTCTTTGTAAATGGCATCACGGCCTTCGACATCAGTCCACCCTGAGGACTTCAGAGGCTGAACCTCTGTTTCCCTTTCC\r\n", ">GCKDGN101BV6U3 gene=GCKDGN101BV6U3 Gene=\"Chondroitin 6-sulfotransferase and related sulfotransferases ; dermatan 4-sulfotransferase 1 [EC:2.8.2.-] E=5e-17\"\r\n", "GCAGTATAGTCGTAGTTGATAGCGCATGGGTAGCATGTCTGGTGCGCAGGGCGCCAGTGCATATCCCACATAAACTTATTTGGCGATGTGTCAATGTATGACGTAAATTCGTCAAACGTTGGCCCAGAACACTCAACGCAACTGTCTGTTTTTCCCGCAGAACGACGAACCTTATTGACTATATCTCTGCCTAGCTTATTATACAATTGAGTCGTTTTTCCGACGAACTTGTCCCTATAGGCTGATAACAGTCTTTCGAAAGGTTCCCAAACAAAGA\r\n", ">GCKDGN101BQG8Q gene=GCKDGN101BQG8Q\r\n", "GCAGTATAACAGTAAATTGTTTCTACTCTCCACTTACCAAAGGAAACCATCAATATGACTGCCGTTTCTTTCTTTCAGTTCCATGTCTCGGTTACATTGAGCGCAGATCAGCGGGTTTGCCAAAAGAGATCGTTGTTGAAGCCACTGAATCAAATTGAGGCGTGGGCCCCGTTGCAAATCAATACATTCGTAAAGATCCATCTCCTTTATCATACGATGGTGCAAACGTCACTTAAATCGACTACCAAAATAGGGAGTAAAAACAGACCAGCTTATTCGAGGGTGGAGAAGTCCTATTAATCAACAATTCTTATGTATATGCGTAAGAGAAGACCGACTCCAGTGAACTACGTTTTGATTGGTTGAGGGTGATGTGGTGAAATAATTCTCATGTGAGCTCGTTTTGTTATCA\r\n" ] } 
], "source": [ "!head past.fasta" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">isotig29249 gene=isogroup27517\r\n", "TAGGAAGTGCTTTGAAACAAGCTATAGCAGTGCCAATACAATACAGTCGAACCTGtATTAAGCGGACACCCTCTATTAAGCGGACAGTAGCCGAAGTCCCAAAATTaCTTGAAATGAAACCTTTATtAAGCGGACACCTCTACTAAGCGGACGCGGACACCTAAAAaGTACCTGAAATGGTCATTTCTATtGTTTCCAACCTGTATTAAACGGACACTTGTAATTAAATTCCACCACCCAACgTgctAG\r\n", ">isotig32730 gene=isogroup30998 Gene=\"Predicted ATPase\"\r\n", "AaTGGaCACGcTtCTTGTTTAAGACAGGAATACTGtCaTAAAAGATGTCCATTAaCATGGTTTTTCCACTTCCcAcTCCTCCGTGGAGATAGaGTCCCTGAGgTGCTTCCTGTGGTTGTGAat\r\n", ">isotig30623 gene=isogroup28891 Gene=\"Sodium-neurotransmitter symporter ; solute carrier family 6 (neurotransmitter transporter, GABA) member 1 E=2e-37\"\r\n", "AACAAAGCCCAGCCAAGAATCACCGTGTAGTAAATGTTGATATACTGCATTACAAGCACACCGGCATAGCCAATTCCTtGGAAAagTGGACAGATTTtCCAAGCCGTGaTTCcACcTTgaCTCGTATATTGCCCCAGACCaatctccatcactaaaacgggtacaccagccacaatcaaacaaagcaagtagggaatgagaaaacaacc\r\n", ">isotig30838 gene=isogroup29106\r\n", "ACAGTCTGACTGTTGGAAGTGTGAAaGTTGATTTaCTTTGTAGAACTTTGCACTTGTAAGcctGTTTGTCatgtcttatgtgagaatttaaaggggctgtgtcatggatgtcaagttcattttctcaacattgcaaattatgcttccttatgcgctacagaacttaacaatagcgaaaagaaattaccaaatgataaaaca\r\n", ">isotig32258 gene=isogroup30526\r\n", "GCAGTTTGAAGTGTAATTTTAATATCATTTTtGGCTAAACTCTAGCTCCAGGCTCCTCTCTGTGCAATACCTCGACAACAAAGTTACCTGCATGAACTGAAAACTCTATTATAGTAGAGTTCATGGTCAAACTCAGTTGGGAGCA\r\n" ] } ], "source": [ "!tail past.fasta" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "30740\r\n" ] } ], "source": [ "!fgrep -c \">\" past.fasta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Blastx query" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!blastx \\\n", "-query past.fasta \\ #FASTA file\n", 
"-db ~blast/db/uniprot_sprot \\ #Use your blastx database address\n", "-max_target_seqs 1 \\ #maximum number of target sequences = 1\n", "-max_hsps 1 \\ #maximum number of high-scoring pairs = 1\n", "-outfmt 6 \\ #output format = tabular\n", "-evalue 1E-05 \\ #E-value = 10^-5\n", "-num_threads 8 \\ #number of threads = 8\n", "-out ../analyses/Past/Past_blastx_uniprot.tab \\ #Direct output to analyses directory\n", "2> ../analyses/Past/Past_blastx_uniprot.error #Direct standard error output to its own file" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Past\n" ] } ], "source": [ "cd ../../analyses/Past" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GCKDGN101CAZ1A\tsp|Q9CYL5|GAPR1_MOUSE\t34.43\t122\t70\t4\t361\t14\t21\t138\t2e-14\t70.1\r\n", "GCKDGN101ANI4S\tsp|P0C8L6|PHS_HYPDU\t71.19\t59\t17\t0\t213\t37\t9\t67\t4e-26\t97.4\r\n", "GCKDGN101BV6U3\tsp|Q7L1S5|CHST9_HUMAN\t38.46\t91\t55\t1\t275\t3\t278\t367\t3e-14\t71.2\r\n", "GCKDGN101CG9W2\tsp|Q9NQX0|PRDM6_HUMAN\t50.59\t85\t39\t2\t252\t7\t247\t331\t3e-22\t94.0\r\n", "GCKDGN101CINM3\tsp|O43913|ORC5_HUMAN\t60.19\t103\t37\t1\t5\t313\t233\t331\t1e-35\t 131\r\n", "GCKDGN101BXQES\tsp|P42690|SRK4_SPOLA\t79.01\t81\t17\t0\t303\t61\t289\t369\t3e-40\t 144\r\n", "GCKDGN101A8LTX\tsp|Q8NDG6|TDRD9_HUMAN\t39.73\t73\t41\t2\t100\t312\t682\t753\t4e-09\t58.2\r\n", "GCKDGN101A9IY8\tsp|Q9BY08|EBPL_HUMAN\t39.34\t61\t36\t1\t237\t55\t83\t142\t7e-06\t47.0\r\n", "GCKDGN101A9QGN\tsp|O42430|CP1A1_LIMLI\t50.41\t123\t56\t2\t358\t2\t342\t463\t3e-36\t 133\r\n", "GCKDGN101B9AAV\tsp|Q92979|NEP1_HUMAN\t64.58\t48\t17\t0\t144\t1\t37\t84\t6e-15\t69.3\r\n" ] } ], "source": [ "!head -10 Past_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": 
false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "isotig28863\tsp|Q8BTJ4|ENPP4_MOUSE\t55.36\t56\t25\t0\t176\t9\t337\t392\t6e-18\t81.3\r\n", "isotig27760\tsp|G8HTB6|ZPP_ACRMI\t55.00\t80\t34\t1\t245\t6\t65\t142\t1e-20\t88.6\r\n", "isotig29504\tsp|Q9VAC5|ADA17_DROME\t60.26\t78\t30\t1\t231\t1\t338\t415\t2e-26\t 105\r\n", "isotig26983\tsp|Q8K010|OPLA_MOUSE\t71.59\t88\t25\t0\t295\t32\t1017\t1104\t4e-36\t 135\r\n", "isotig28666\tsp|Q5ZL38|SGF29_CHICK\t80.00\t30\t6\t0\t263\t174\t264\t293\t2e-09\t56.2\r\n", "isotig27604\tsp|P38795|NADE_YEAST\t67.42\t89\t29\t0\t278\t12\t175\t263\t2e-41\t 148\r\n", "isotig28804\tsp|Q6DCP1|FXRD1_XENLA\t42.86\t63\t31\t3\t31\t213\t327\t386\t2e-06\t48.5\r\n", "isotig29116\tsp|Q6ZQ88|KDM1A_MOUSE\t71.08\t83\t23\t1\t251\t3\t259\t340\t7e-26\t 105\r\n", "isotig32730\tsp|P32317|AFG1_YEAST\t65.71\t35\t11\t1\t103\t2\t119\t153\t2e-08\t52.4\r\n", "isotig30623\tsp|A5PJX7|S6A13_BOVIN\t60.00\t70\t27\t1\t209\t3\t68\t137\t3e-21\t90.5\r\n" ] } ], "source": [ "!tail -10 Past_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 13789 165468 1031497 /Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Past/Past_blastx_uniprot.tab\r\n" ] } ], "source": [ "!wc Past_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GCKDGN101CAZ1A\tsp|Q9CYL5|GAPR1_MOUSE\t34.43\t122\t70\t4\t361\t14\t21\t138\t2e-14\t70.1\n", "SQLShare ready version has Pipes converted to Tabs ....\n", "GCKDGN101CAZ1A\tsp\tQ9CYL5\tGAPR1_MOUSE\t34.43\t122\t70\t4\t361\t14\t21\t138\t2e-14\t70.1\n" ] } ], "source": [ "#Replacing pipes with tabs to produce a tab-delimited file (tr reads stdin only, so redirect the input and output files)\n", "!tr '|' \"\\t\" < Past_blastx_uniprot.tab > Past_blastx_uniprot.sql.tab\n", "!head -1 Past_blastx_uniprot.tab\n", "!echo SQLShare ready version has Pipes converted to Tabs 
....\n", "!head -1 Past_blastx_uniprot.sql.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Manually uploading Past_blastx_uniprot.sql.tab to SQLShare and joining with GOSlim:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### First upload dataset\n", "![screen shot1](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.01.38%20PM.png?raw=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Then find the dataset, execute query, and download the new dataset\n", "![screen shot](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.29.18%20PM.png?raw=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Query (note: insert your SQLShare account instead of jldimond@washington.edu)\n", "\n", "```sql\n", "SELECT Distinct Column1 as ContigID, GOSlim_bin\n", " FROM [jldimond@washington.edu].[Past_blastx_uniprot.sql.tab]anno\n", " left join [sr320@washington.edu].[SPID and GO Numbers]go\n", " on anno.Column3=go.SPID\n", " left join [sr320@washington.edu].[GO_to_GOslim]slim\n", " on go.GOID=slim.GO_id where aspect like 'P'\n", "```" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Output file downloaded to ./analyses/Past/Past_GOSlim.csv" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID,GOSlim_bin\r", "\r\n", "GCKDGN103GZQ3J,developmental processes\r", "\r\n", "isotig03630,transport\r", "\r\n", "GCKDGN103G9D8B,stress response\r", "\r\n", "isotig20992,developmental processes\r", "\r\n", "isotig30372,DNA metabolism\r", "\r\n", "isotig19723,developmental processes\r", "\r\n", "isotig20617,cell cycle and proliferation\r", "\r\n", "GCKDGN101BS14I,DNA metabolism\r", "\r\n", "isotig07465,stress response\r", "\r\n" ] } ], "source": [ "!head -10 Past_GOSlim.csv" ] }, { "cell_type": "code", 
"execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 54490 134548 1925703 /Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Past/Past_GOSlim.csv\r\n" ] } ], "source": [ "!wc Past_GOSlim.csv" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Replacing commas with tabs \n", "!tr ',' \"\\t\" Past_GOSlim.tab" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID\tGOSlim_bin\r", "\r\n", "GCKDGN103GZQ3J\tdevelopmental processes\r", "\r\n", "isotig03630\ttransport\r", "\r\n", "GCKDGN103G9D8B\tstress response\r", "\r\n", "isotig20992\tdevelopmental processes\r", "\r\n", "isotig30372\tDNA metabolism\r", "\r\n", "isotig19723\tdevelopmental processes\r", "\r\n", "isotig20617\tcell cycle and proliferation\r", "\r\n", "GCKDGN101BS14I\tDNA metabolism\r", "\r\n", "isotig07465\tstress response\r", "\r\n" ] } ], "source": [ "!head Past_GOSlim.tab" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 54490 189038 1925703 Past_GOSlim.tab\r\n" ] } ], "source": [ "!wc Past_GOSlim.tab" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID\tGOSlim_bin\r", "\r\n", "GCKDGN101A00AO\tother biological processes\r", "\r\n", "GCKDGN101A00ZL\tcell organization and biogenesis\r", "\r\n", "GCKDGN101A028J\tother biological processes\r", "\r\n", "GCKDGN101A02W5\ttransport\r", "\r\n", "GCKDGN101A03QE\tother metabolic processes\r", "\r\n", "GCKDGN101A03XE\tother metabolic processes\r", "\r\n", "GCKDGN101A0607\tprotein metabolism\r", "\r\n", "GCKDGN101A06FB\tprotein metabolism\r", "\r\n", "GCKDGN101A0884\tother biological processes\r", 
"\r\n", "GCKDGN101A08CL\tprotein metabolism\r", "\r\n", "GCKDGN101A08WT\ttransport\r", "\r\n", "GCKDGN101A096T\tother metabolic processes\r", "\r\n", "GCKDGN101A097A\tother metabolic processes\r", "\r\n", "GCKDGN101A0ABX\tstress response\r", "\r\n", "GCKDGN101A0AQ5\tother metabolic processes\r", "\r\n", "GCKDGN101A0CJ2\tother biological processes\r", "\r\n", "GCKDGN101A0I25\tdevelopmental processes\r", "\r\n", "GCKDGN101A0KR8\tother metabolic processes\r", "\r\n", "GCKDGN101A0KSC\tother biological processes\r", "\r\n", "GCKDGN101A0M5U\ttransport\r", "\r\n", "GCKDGN101A0P88\ttransport\r", "\r\n", "GCKDGN101A0Q54\tsignal transduction\r", "\r\n", "GCKDGN101A0QJB\tcell cycle and proliferation\r", "\r\n", "GCKDGN101A0RO1\tother metabolic processes\r", "\r\n", "GCKDGN101A0S0M\ttransport\r", "\r\n", "GCKDGN101A0S2C\tstress response\r", "\r\n", "GCKDGN101A0S84\tdevelopmental processes\r", "\r\n", "GCKDGN101A0SI8\tstress response\r", "\r\n", "GCKDGN101A0UWI\tother biological processes\r", "\r\n", "GCKDGN101A0ZFB\tcell organization and biogenesis\r", "\r\n", "GCKDGN101A10C1\tother metabolic processes\r", "\r\n", "GCKDGN101A12VJ\tother biological processes\r", "\r\n", "GCKDGN101A13AQ\tother metabolic processes\r", "\r\n", "GCKDGN101A13V9\tDNA metabolism\r", "\r\n", "GCKDGN101A14KT\tother metabolic processes\r", "\r\n", "GCKDGN101A14VE\tprotein metabolism\r", "\r\n", "GCKDGN101A15QU\tsignal transduction\r", "\r\n", "GCKDGN101A17ZT\tstress response\r", "\r\n", "GCKDGN101A18J6\tcell organization and biogenesis\r", "\r\n", "GCKDGN101A19EE\tother metabolic processes\r", "\r\n", "GCKDGN101A1CPM\tother biological processes\r", "\r\n", "GCKDGN101A1KRI\tRNA metabolism\r", "\r\n", "GCKDGN101A1LTI\tRNA metabolism\r", "\r\n", "GCKDGN101A1O0W\tprotein metabolism\r", "\r\n", "GCKDGN101A1Q78\tcell cycle and proliferation\r", "\r\n", "GCKDGN101A1QPN\tother metabolic processes\r", "\r\n", "GCKDGN101A1TLT\ttransport\r", "\r\n", "GCKDGN101A1TOY\tother biological processes\r", "\r\n", 
"GCKDGN101A1V7Z\tother metabolic processes\r", "\r\n", "GCKDGN101A1VC2\tother metabolic processes\r", "\r\n", "GCKDGN101A2042\tsignal transduction\r", "\r\n", "GCKDGN101A20KS\tprotein metabolism\r", "\r\n", "GCKDGN101A22GA\tprotein metabolism\r", "\r\n", "GCKDGN101A23YI\tdevelopmental processes\r", "\r\n", "GCKDGN101A28J5\ttransport\r", "\r\n", "GCKDGN101A2BO2\tother biological processes\r", "\r\n", "GCKDGN101A2BOP\tother biological processes\r", "\r\n", "GCKDGN101A2DX5\tother metabolic processes\r", "\r\n", "GCKDGN101A2DZK\tother biological processes\r", "\r\n", "GCKDGN101A2EHS\tprotein metabolism\r", "\r\n", "GCKDGN101A2FEY\tother metabolic processes\r", "\r\n", "GCKDGN101A2JR9\tsignal transduction\r", "\r\n", "GCKDGN101A2K37\tother metabolic processes\r", "\r\n", "GCKDGN101A2LIL\tstress response\r", "\r\n", "GCKDGN101A2NQR\ttransport\r", "\r\n", "GCKDGN101A2NZY\ttransport\r", "\r\n", "GCKDGN101A2PU8\tother biological processes\r", "\r\n", "GCKDGN101A2QVI\tother biological processes\r", "\r\n", "GCKDGN101A2R04\tother metabolic processes\r", "\r\n", "GCKDGN101A2TT4\tother biological processes\r", "\r\n", "GCKDGN101A2V3C\tother metabolic processes\r", "\r\n", "GCKDGN101A2WY5\tother biological processes\r", "\r\n", "GCKDGN101A32G8\tother biological processes\r", "\r\n", "GCKDGN101A33CU\tother metabolic processes\r", "\r\n", "GCKDGN101A34D9\tdeath\r", "\r\n", "GCKDGN101A35AI\ttransport\r", "\r\n", "GCKDGN101A35CP\tstress response\r", "\r\n", "GCKDGN101A38H8\tother metabolic processes\r", "\r\n", "GCKDGN101A38VU\tprotein metabolism\r", "\r\n", "GCKDGN101A39MO\tdeath\r", "\r\n", "GCKDGN101A39U8\tprotein metabolism\r", "\r\n", "GCKDGN101A3A1B\tRNA metabolism\r", "\r\n", "GCKDGN101A3AVK\tother metabolic processes\r", "\r\n", "GCKDGN101A3EUL\tprotein metabolism\r", "\r\n", "GCKDGN101A3FW4\tRNA metabolism\r", "\r\n", "GCKDGN101A3GMY\tprotein metabolism\r", "\r\n", "GCKDGN101A3LA5\tprotein metabolism\r", "\r\n", "GCKDGN101A3MZM\tsignal transduction\r", "\r\n", 
"GCKDGN101A3N2H\tprotein metabolism\r", "\r\n", "GCKDGN101A3NNK\tother biological processes\r", "\r\n", "GCKDGN101A3QHI\tRNA metabolism\r", "\r\n", "GCKDGN101A3RJK\tcell adhesion\r", "\r\n", "GCKDGN101A3SQJ\tother biological processes\r", "\r\n", "GCKDGN101A3TPT\tother metabolic processes\r", "\r\n", "GCKDGN101A3WTT\tprotein metabolism\r", "\r\n", "GCKDGN101A3WZT\tDNA metabolism\r", "\r\n", "GCKDGN101A3YQM\tstress response\r", "\r\n", "GCKDGN101A3ZIL\tdeath\r", "\r\n", "GCKDGN101A3ZVE\tother metabolic processes\r", "\r\n" ] } ], "source": [ "!sort -u -k1,1 Past_GOSlim.tab > Past_GOSlim.tab.uniq\n", "!head -100 Past_GOSlim.tab.uniq" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 23848 82189 839095 Past_GOSlim.tab.uniq\r\n" ] } ], "source": [ "!wc Past_GOSlim.tab.uniq" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 13789 193046 1031497 Past_blastx_uniprot.sql.tab\r\n" ] } ], "source": [ "!wc Past_blastx_uniprot.sql.tab" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GCKDGN101A00AO\tsp\tC8YR32\tLOXH1_MOUSE\t40.00\t65\t39\t0\t198\t4\t653\t717\t2e-09\t58.2\r\n", "GCKDGN101A00ZL\tsp\tQ23979\tMY61F_DROME\t70.97\t124\t35\t1\t371\t3\t322\t445\t3e-54\t 187\r\n", "GCKDGN101A02W5\tsp\tQ9PT84\tKCNH2_CHICK\t42.35\t85\t43\t2\t238\t2\t75\t159\t2e-09\t57.0\r\n", "GCKDGN101A03XE\tsp\tQ5RA96\tGUAA_PONAB\t76.15\t109\t26\t0\t338\t12\t28\t136\t1e-51\t 179\r\n", "GCKDGN101A06FB\tsp\tQ5PQ63\tIMP2L_XENLA\t65.45\t55\t19\t0\t7\t171\t105\t159\t2e-12\t63.9\r\n", "GCKDGN101A08CL\tsp\tQ09328\tMGT5A_HUMAN\t57.89\t57\t24\t0\t87\t257\t141\t197\t2e-16\t79.0\r\n", "GCKDGN101A097A\tsp\tQ3MHG6\tGTPBA_BOVIN\t43.42\t76\t42\t1\t17\t244\t81\t155\t6e-13\t66.6\r\n", 
"GCKDGN101A0KSC\tsp\tQ6P2K6\tP4R3A_MOUSE\t60.00\t90\t33\t1\t273\t4\t96\t182\t2e-25\t 105\r\n", "GCKDGN101A0M8H\tsp\tA2RUR9\tC144A_HUMAN\t29.31\t116\t75\t2\t386\t42\t1232\t1341\t3e-09\t58.5\r\n", "GCKDGN101A0NCL\tsp\tQ08AV6\tTBCC1_XENLA\t40.78\t103\t59\t2\t8\t310\t178\t280\t2e-17\t81.3\r\n", "GCKDGN101A0S0M\tsp\tO00763\tACACB_HUMAN\t67.20\t125\t41\t0\t381\t7\t326\t450\t8e-55\t 191\r\n", "GCKDGN101A12VJ\tsp\tQ99LS1\tMMAD_MOUSE\t60.98\t41\t16\t0\t499\t377\t250\t290\t2e-10\t61.6\r\n", "GCKDGN101A13V9\tsp\tQ9EPU0\tRENT1_MOUSE\t79.78\t89\t18\t0\t6\t272\t390\t478\t2e-45\t 161\r\n", "GCKDGN101A15QU\tsp\tQ5F361\tTBCK_CHICK\t84.13\t63\t10\t0\t198\t10\t532\t594\t3e-32\t 122\r\n", "GCKDGN101A17ZT\tsp\tQ60452\tERCC2_CRIGR\t74.63\t134\t34\t0\t406\t5\t312\t445\t4e-67\t 221\r\n", "GCKDGN101A19EE\tsp\tQ8BLI4\tDSE_MOUSE\t46.30\t108\t47\t2\t7\t330\t394\t490\t1e-23\t 100\r\n", "GCKDGN101A1O0W\tsp\tQ63415\tPCSK6_RAT\t56.90\t58\t25\t0\t292\t119\t265\t322\t2e-14\t73.2\r\n", "GCKDGN101A1T4S\tsp\tQ8NBL1\tPGLT1_HUMAN\t57.14\t63\t26\t1\t191\t6\t148\t210\t1e-17\t79.7\r\n", "GCKDGN101A1TLT\tsp\tP52825\tCPT2_MOUSE\t75.81\t62\t14\t1\t188\t6\t541\t602\t1e-24\t 100\r\n", "GCKDGN101A20KS\tsp\tQ9ULH0\tKDIS_HUMAN\t55.34\t103\t45\t1\t6\t314\t397\t498\t2e-33\t 130\r\n", "GCKDGN101A23YI\tsp\tQ8WXG9\tGPR98_HUMAN\t42.31\t78\t43\t1\t12\t245\t5976\t6051\t2e-19\t87.0\r\n", "GCKDGN101A28J5\tsp\tQ75NA5\tGBRB_MUSDO\t36.94\t111\t66\t2\t328\t2\t68\t176\t7e-18\t82.8\r\n", "GCKDGN101A2DZK\tsp\tA6H603\tNWD1_MOUSE\t41.03\t117\t68\t1\t14\t361\t13\t129\t3e-23\t99.8\r\n", "GCKDGN101A2LIL\tsp\tD4ACP5\tRECQ5_RAT\t64.94\t77\t27\t0\t231\t1\t14\t90\t5e-25\t 103\r\n", "GCKDGN101A2NZY\tsp\tQ6INQ6\tS246A_XENLA\t47.01\t117\t58\t1\t6\t344\t298\t414\t2e-30\t 118\r\n", "GCKDGN101A2R04\tsp\tA9CPT4\tTDRD1_ORYLA\t45.07\t71\t38\t1\t60\t269\t438\t508\t2e-09\t58.5\r\n", "GCKDGN101A2WY5\tsp\tQ1LWV7\tTLDC1_DANRE\t32.31\t65\t43\t1\t399\t208\t256\t320\t4e-06\t48.9\r\n", 
"GCKDGN101A35AI\tsp\tQ96JC1\tVPS39_HUMAN\t60.87\t46\t18\t0\t140\t3\t501\t546\t6e-11\t62.8\r\n", "GCKDGN101A38VU\tsp\tP62876\tRPAB5_MOUSE\t91.53\t59\t5\t0\t177\t1\t1\t59\t1e-34\t 119\r\n", "GCKDGN101A39U8\tsp\tQ5U5R9\tHECD2_HUMAN\t61.76\t34\t13\t0\t4\t105\t375\t408\t3e-06\t47.4\r\n", "GCKDGN101A3AVK\tsp\tQ66H39\tABCF3_RAT\t67.86\t56\t18\t0\t301\t468\t544\t599\t1e-17\t84.0\r\n", "GCKDGN101A3LA5\tsp\tQ6N069\tNAA16_HUMAN\t49.52\t105\t53\t0\t333\t19\t692\t796\t4e-18\t84.0\r\n", "GCKDGN101A3LGC\tsp\tQ8N3P4\tVPS8_HUMAN\t52.05\t73\t35\t0\t296\t78\t451\t523\t2e-16\t79.7\r\n", "GCKDGN101A3QHI\tsp\tO42202\tNDF1_DANRE\t62.07\t58\t21\t1\t268\t95\t97\t153\t3e-12\t66.6\r\n", "GCKDGN101A3RJK\tsp\tP26038\tMOES_HUMAN\t61.82\t55\t21\t0\t238\t402\t255\t309\t2e-17\t82.4\r\n", "GCKDGN101A3SQJ\tsp\tQ80U70\tSUZ12_MOUSE\t41.43\t70\t35\t2\t341\t135\t301\t365\t3e-11\t63.5\r\n", "GCKDGN101A3YQM\tsp\tQ9UKK3\tPARP4_HUMAN\t41.53\t118\t65\t3\t1\t351\t755\t869\t4e-19\t87.4\r\n", "GCKDGN101A44RN\tsp\tO88277\tFAT2_RAT\t44.62\t65\t33\t2\t347\t162\t3376\t3440\t4e-07\t52.4\r\n", "GCKDGN101A45U8\tsp\tQ9UBB5\tMBD2_HUMAN\t64.52\t62\t21\t1\t187\t5\t149\t210\t8e-21\t87.8\r\n", "GCKDGN101A46X0\tsp\tQ5U315\tGLD2_RAT\t41.44\t111\t56\t4\t3\t329\t319\t422\t2e-14\t72.4\r\n", "GCKDGN101A4FGV\tsp\tQ9Y5I1\tPCDAB_HUMAN\t44.53\t128\t60\t3\t380\t6\t260\t379\t2e-26\t 108\r\n", "GCKDGN101A4ILS\tsp\tA6QNR1\tRRP36_BOVIN\t52.78\t72\t34\t0\t138\t353\t98\t169\t4e-13\t67.0\r\n", "GCKDGN101A4JZ5\tsp\tQ9P2N4\tATS9_HUMAN\t35.37\t147\t77\t4\t395\t9\t1403\t1549\t1e-14\t75.1\r\n", "GCKDGN101A4LIT\tsp\tQ8MI29\tCBR1_MACFA\t53.26\t92\t37\t2\t272\t3\t68\t155\t4e-23\t95.1\r\n", "GCKDGN101A4PKS\tsp\tQ9HDZ5\tYKP9_SCHPO\t42.42\t99\t57\t0\t1\t297\t191\t289\t4e-18\t82.4\r\n", "GCKDGN101A4RYG\tsp\tP51532\tSMCA4_HUMAN\t53.42\t73\t34\t0\t224\t6\t1462\t1534\t2e-22\t95.5\r\n", "GCKDGN101A4SUY\tsp\tP33450\tFAT_DROME\t39.02\t123\t73\t1\t1\t369\t3421\t3541\t1e-20\t92.0\r\n", 
"GCKDGN101A4VAF\tsp\tQ64676\tCGT_MOUSE\t57.14\t49\t21\t0\t159\t305\t306\t354\t6e-12\t65.1\r\n", "GCKDGN101A4VKU\tsp\tQ5RE70\tINT3_PONAB\t80.26\t76\t15\t0\t228\t1\t343\t418\t6e-38\t 139\r\n", "GCKDGN101A4W2H\tsp\tQ61457\tCCNE1_MOUSE\t65.79\t114\t39\t0\t349\t8\t112\t225\t2e-48\t 165\r\n", "GCKDGN101A4WM3\tsp\tP14868\tSYDC_HUMAN\t83.33\t84\t14\t0\t252\t1\t396\t479\t2e-38\t 138\r\n", "GCKDGN101A4XPZ\tsp\tQ96L58\tB3GT6_HUMAN\t61.90\t84\t32\t0\t3\t254\t246\t329\t9e-33\t 122\r\n", "GCKDGN101A51N3\tsp\tQ5REW9\tANR27_PONAB\t40.70\t86\t51\t0\t2\t259\t463\t548\t8e-15\t73.6\r\n", "GCKDGN101A5213\tsp\tQ5R7S4\tSTRP1_PONAB\t89.42\t104\t11\t0\t313\t2\t668\t771\t1e-51\t 177\r\n", "GCKDGN101A527Z\tsp\tQ9D4H8\tCUL2_MOUSE\t65.15\t66\t23\t0\t204\t7\t405\t470\t5e-11\t62.8\r\n", "GCKDGN101A52QR\tsp\tQ6V0I7\tFAT4_HUMAN\t50.57\t87\t36\t3\t329\t75\t3884\t3965\t2e-25\t 105\r\n", "GCKDGN101A52V3\tsp\tQ69ZX6\tMOR2A_MOUSE\t54.95\t91\t37\t2\t10\t279\t140\t227\t8e-27\t 109\r\n", "GCKDGN101A53V7\tsp\tQ5ZJZ6\tNCBP1_CHICK\t42.06\t107\t62\t0\t33\t353\t540\t646\t6e-25\t 103\r\n", "GCKDGN101A56R5\tsp\tP27544\tCERS1_HUMAN\t49.28\t69\t35\t0\t7\t213\t122\t190\t6e-17\t77.0\r\n", "GCKDGN101A5AD9\tsp\tQ4UMH6\tY381_RICFE\t40.00\t85\t47\t1\t36\t290\t637\t717\t7e-07\t50.8\r\n", "GCKDGN101A5BL5\tsp\tQ7TN88\tPK1L2_MOUSE\t47.15\t123\t63\t1\t365\t3\t1366\t1488\t2e-32\t 126\r\n", "GCKDGN101A5DJ7\tsp\tQ13356\tPPIL2_HUMAN\t61.54\t65\t24\t1\t293\t99\t1\t64\t2e-15\t75.9\r\n", "GCKDGN101A5FF0\tsp\tA1A5F2\tHTR5B_XENTR\t53.62\t69\t32\t0\t117\t323\t1108\t1176\t2e-15\t77.0\r\n", "GCKDGN101A5KEV\tsp\tF6QEU4\tLIN41_XENTR\t45.12\t82\t39\t2\t56\t283\t583\t664\t5e-11\t63.9\r\n", "GCKDGN101A5P57\tsp\tQ8IWR1\tTRI59_HUMAN\t40.24\t82\t39\t2\t253\t35\t4\t84\t6e-13\t68.6\r\n", "GCKDGN101A5QSY\tsp\tQ9UPW5\tCBPC1_HUMAN\t77.36\t53\t12\t0\t224\t66\t1087\t1139\t3e-21\t91.7\r\n", "GCKDGN101A5S8T\tsp\tP04047\tKITH_CHICK\t51.22\t82\t39\t1\t259\t17\t1\t82\t3e-23\t94.0\r\n", 
"GCKDGN101A5YWS\tsp\tQ52KW8\tRCC2_XENLA\t52.60\t154\t71\t1\t27\t482\t89\t242\t7e-46\t 162\r\n", "GCKDGN101A5ZFE\tsp\tQ9NYQ6\tCELR1_HUMAN\t41.67\t84\t48\t1\t4\t252\t2500\t2583\t1e-16\t80.9\r\n", "GCKDGN101A5ZON\tsp\tQ9P225\tDYH2_HUMAN\t45.71\t105\t57\t0\t349\t35\t2163\t2267\t2e-25\t 106\r\n", "GCKDGN101A63TD\tsp\tB3A0N5\tAPY_TABYA\t42.67\t75\t42\t1\t250\t26\t477\t550\t1e-13\t69.3\r\n", "GCKDGN101A640O\tsp\tQ9BXW6\tOSBL1_HUMAN\t56.67\t60\t26\t0\t33\t212\t833\t892\t5e-18\t82.0\r\n", "GCKDGN101A642G\tsp\tQ91W43\tGCSP_MOUSE\t68.50\t127\t39\t1\t5\t385\t465\t590\t5e-54\t 187\r\n", "GCKDGN101A68P2\tsp\tQ9D2V5\tAAR2_MOUSE\t67.74\t62\t20\t0\t193\t8\t5\t66\t2e-26\t 105\r\n", "GCKDGN101A6A53\tsp\tQ8BG58\tP4HTM_MOUSE\t38.68\t106\t55\t4\t395\t93\t273\t373\t6e-12\t66.2\r\n", "GCKDGN101A6HZZ\tsp\tQ63955\tPO4F3_MOUSE\t87.80\t82\t10\t0\t3\t248\t178\t259\t9e-43\t 147\r\n", "GCKDGN101A6KTM\tsp\tQ9UBG0\tMRC2_HUMAN\t46.88\t64\t26\t2\t198\t16\t262\t320\t4e-09\t56.2\r\n", "GCKDGN101A6KXV\tsp\tP10394\tPOL4_DROME\t35.21\t71\t46\t0\t7\t219\t907\t977\t9e-08\t52.4\r\n", "GCKDGN101A6L12\tsp\tQ8TE73\tDYH5_HUMAN\t25.45\t110\t74\t1\t369\t40\t1827\t1928\t3e-09\t58.5\r\n", "GCKDGN101A6N29\tsp\tP25291\tGP2_CANFA\t39.19\t74\t36\t3\t223\t2\t31\t95\t1e-06\t49.3\r\n", "GCKDGN101A6QFQ\tsp\tP28227\tMAOX_ANAPL\t70.59\t85\t25\t0\t256\t2\t120\t204\t1e-26\t 106\r\n", "GCKDGN101A6RZZ\tsp\tQ9VPH8\tRBBP5_DROME\t60.00\t110\t44\t0\t336\t7\t99\t208\t1e-38\t 139\r\n", "GCKDGN101A6TK0\tsp\tQ499Y0\tXPO4_XENLA\t58.12\t117\t49\t0\t355\t5\t2\t118\t2e-38\t 142\r\n", "GCKDGN101A6W27\tsp\tP61590\tMCPH1_COLGU\t41.51\t53\t31\t0\t174\t16\t761\t813\t3e-06\t47.8\r\n", "GCKDGN101A6YC9\tsp\tA1L0Y2\tNEUL4_XENTR\t35.42\t144\t75\t4\t421\t5\t360\t490\t4e-16\t79.3\r\n", "GCKDGN101A6YHL\tsp\tQ9ESW0\tDDB1_RAT\t83.12\t77\t12\t1\t300\t70\t106\t181\t2e-38\t 142\r\n", "GCKDGN101A7EWB\tsp\tP11531\tDMD_MOUSE\t37.40\t123\t70\t2\t353\t6\t2803\t2925\t2e-15\t77.0\r\n", 
"GCKDGN101A7IKO\tsp\tQ8WN22\tPRKDC_CANFA\t31.11\t90\t62\t0\t281\t12\t78\t167\t2e-12\t69.7\r\n", "GCKDGN101A7L4B\tsp\tA9LMC0\tDPF3_DANRE\t80.00\t30\t6\t0\t380\t291\t350\t379\t2e-10\t61.2\r\n", "GCKDGN101A7L4Y\tsp\tQ7T3Q7\tOPSO_RUTRU\t37.66\t77\t48\t0\t41\t271\t240\t316\t2e-12\t67.4\r\n", "GCKDGN101A7NLU\tsp\tP26369\tU2AF2_MOUSE\t69.01\t71\t21\t1\t218\t6\t220\t289\t4e-25\t 103\r\n", "GCKDGN101A7R5V\tsp\tQ52KG5\tKI26A_MOUSE\t60.94\t64\t25\t0\t483\t292\t661\t724\t4e-18\t86.3\r\n", "GCKDGN101A7UY5\tsp\tQ9UIF8\tBAZ2B_HUMAN\t43.69\t103\t58\t0\t349\t41\t2063\t2165\t9e-23\t98.2\r\n", "GCKDGN101A7XOY\tsp\tQ9R0W9\tACHA6_MOUSE\t37.97\t79\t49\t0\t8\t244\t233\t311\t8e-17\t80.1\r\n", "GCKDGN101A831E\tsp\tP10711\tTCEA1_MOUSE\t52.54\t59\t25\t1\t254\t78\t1\t56\t7e-12\t63.9\r\n", "GCKDGN101A84QZ\tsp\tQ9NZJ4\tSACS_HUMAN\t42.55\t94\t48\t2\t290\t27\t646\t739\t2e-06\t49.3\r\n", "GCKDGN101A8JQQ\tsp\tQ2HJ51\tRA51D_BOVIN\t43.96\t91\t49\t1\t333\t61\t98\t186\t4e-19\t84.0\r\n", "GCKDGN101A8JYC\tsp\tQ7Z6Z7\tHUWE1_HUMAN\t40.28\t72\t42\t1\t45\t257\t3322\t3393\t8e-09\t56.2\r\n", "GCKDGN101A8KGC\tsp\tQ08890\tIDS_MOUSE\t49.38\t81\t41\t0\t248\t6\t18\t98\t2e-19\t86.3\r\n", "GCKDGN101A8LD2\tsp\tQ8C5V5\tWDR27_MOUSE\t39.29\t84\t51\t0\t259\t8\t498\t581\t2e-15\t75.1\r\n" ] } ], "source": [ "!sort -u -k1,1 Past_blastx_uniprot.sql.tab > Past_blastx_uniprot.sql.tab.uniq\n", "!head -100 Past_blastx_uniprot.sql.tab.uniq" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }