{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Blast and GOSlim annotation of *Stylophora pistillata* transcriptome" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow details the annotation of a *Stylophora pistillata* [transcriptome](http://data.centrescientifique.mc/Data/454Isotigs.fas.zip).\n", "\n", "The notebook requires that you have the following:\n", "- [NCBI Blast: 2.2.3](ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/)\n", "- [SQLShare](https://sqlshare.escience.washington.edu/accounts/login/?next=/sqlshare/%3F__hash__%3D)\n", "\n", "The annotation also requires a Uniprot/Swissprot BLAST database. Instructions for setting up this database can be found [here](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/README.md).\n", "\n", "The original analysis was carried out on Mac OS X v10.10.3 running Python: 2.7.9 and IPython: 3.1.0.\n", "\n", "This workflow is structured so that anyone can reproduce the analysis by downloading the repository locally and executing the notebook." 
] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Spist\n" ] } ], "source": [ "cd ../data/Spist" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 4779k 100 4779k 0 0 99645 0 0:00:49 0:00:49 --:--:-- 99428\n" ] } ], "source": [ "#Obtain FASTA file\n", "!curl -O http://data.centrescientifique.mc/Data/454Isotigs.fas.zip" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: 454Isotigs.fas.zip\n", " inflating: 454Isotigs.fas \n", " creating: __MACOSX/\n", " inflating: __MACOSX/._454Isotigs.fas \n" ] } ], "source": [ "!unzip 454Isotigs.fas.zip" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#Rename the extracted FASTA (454Isotigs.fas) to Spist.fasta\n", "#(the ._454Isotigs.fas entry under __MACOSX/ is only a macOS metadata stub, not the sequences)\n", "!mv 454Isotigs.fas Spist.fasta" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!head Spist.fasta" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Spi_isotig15030 gene=isogroup12260 length=112\r", "\r\n", "GCCAGCTGTTGTGTCACAaGATTCAGATTTTAAACCCcaGATCACCACAGAAGAggTGAAAGGCTAATCGAACTTGGAAATAGCTGGTTTTCTGCGAAAACTaTTTAAGTAG\r", "\r\n", ">Spi_isotig15031 gene=isogroup12261 length=103\r", "\r\n", "ATGTAaGGAGAAGATTTTtAGGGAAGGAGAGTTCACCCTTAGAACcGATCCGTGAGcTGGGTTGATAGCGTCTGtCGGCAGTTTTTAcctcATGGTCTGATTT\r", "\r\n", ">Spi_isotig15032 gene=isogroup12262 length=95\r", "\r\n", "AATTCATCCATTTGTTTTTGTTGACTAaTATTATCTTCAGTGgCCTtCTCCCCTTCTTTCGGTAGAGGCGCCATCTTTAATTTTCAACACCTCTC\r", "\r\n", ">Spi_isotig15033 gene=isogroup12263 length=88\r", "\r\n", "GTGTGTGTGTGtGCGTGTGTGtGtGTGTGTGTGTGTGTGtgtgTGAGTGTGATTGagCACATTCTCATAAGTCTCGCATTagCGGAAT\r", "\r\n", ">Spi_isotig15034 gene=isogroup12264 length=75\r", "\r\n", "ggCTtGGCGAAgAGtAGatCATTAACTACATTCAGTCTCACAGTTAaCagTCACGCTGACAAGATGGgAGACTGG\r", "\r\n" ] } ], "source": [ "!tail Spist.fasta" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "15052\r\n" ] } ], "source": [ "!fgrep -c \">\" Spist.fasta" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Blastx query" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "#blastx options used below:\n",
 "#  -query            FASTA file\n",
 "#  -db               Use your blastx database address\n",
 "#  -max_target_seqs  maximum number of target sequences = 1\n",
 "#  -max_hsps         maximum number of high-scoring pairs = 1\n",
 "#  -outfmt 6         output format = tabular\n",
 "#  -evalue           E-value = 10^-5\n",
 "#  -num_threads      number of threads = 8\n",
 "#  -out              Direct output to analyses directory (relative to data/Spist)\n",
 "#  2>                Direct standard error output to its own file\n",
 "#Comments are kept off the continued lines: text after a trailing backslash would end the command early.\n",
 "!blastx \\\n",
 "-query Spist.fasta \\\n",
 "-db ~blast/db/uniprot_sprot \\\n",
 "-max_target_seqs 1 \\\n",
 "-max_hsps 1 \\\n",
 "-outfmt 6 \\\n",
 "-evalue 1E-05 \\\n",
 "-num_threads 8 \\\n",
 "-out ../../analyses/Spist/Spist_blastx_uniprot.tab \\\n",
 "2> ../../analyses/Spist/Spist_blastx_uniprot.error" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/analyses/Spist\n" ] } ], "source": [ "cd ../../analyses/Spist" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032\tsp|Q9W6H5|UBC9A_DANRE\t85.81\t155\t22\t0\t133\t597\t1\t155\t1e-95\t 293\r\n", "Spi_contig00035\tsp|P80204|TGFR1_RAT\t69.77\t258\t78\t0\t2\t775\t244\t501\t2e-129\t 397\r\n", "Spi_contig00040\tsp|Q35101|COX1_METSE\t82.85\t449\t72\t1\t605\t1936\t70\t518\t0.0\t 665\r\n", "Spi_contig00046\tsp|E7EY42|PTSS2_DANRE\t56.41\t234\t101\t1\t19\t720\t26\t258\t6e-91\t 281\r\n", "Spi_contig00075\tsp|Q37556|NU1M_METSE\t80.16\t247\t48\t1\t1719\t2456\t29\t275\t3e-104\t 333\r\n", "Spi_contig00094\tsp|P00519|ABL1_HUMAN\t48.15\t54\t28\t0\t2494\t2333\t443\t496\t2e-09\t65.1\r\n", "Spi_contig00095\tsp|P20693|FCER2_MOUSE\t28.57\t203\t122\t5\t831\t229\t128\t309\t6e-18\t87.0\r\n", "Spi_contig00834\tsp|P19615|MYP_STRPU\t32.94\t598\t321\t18\t50\t1741\t132\t683\t2e-84\t 294\r\n", "Spi_isotig00002\tsp|Q5I2B1|ACTPG_OULOR\t56.49\t131\t56\t1\t1429\t1037\t7\t136\t6e-41\t 151\r\n", "Spi_isotig00005\tsp|C9EIC7|ACTP1_URTCR\t51.67\t120\t57\t1\t1444\t1085\t54\t172\t2e-31\t 126\r\n" ] } ], "source": [ "!head -10 Spist_blastx_uniprot.tab" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_isotig14970\tsp|Q9Y2Y1|RPC10_HUMAN\t82.35\t51\t7\t1\t29\t175\t58\t108\t4e-23\t91.3\r\n", "Spi_isotig14971\tsp|Q6DK93|CNFN_XENTR\t44.19\t43\t24\t0\t320\t192\t66\t108\t2e-06\t46.2\r\n", "Spi_isotig14981\tsp|Q0VCN3|IFT27_BOVIN\t60.53\t38\t15\t0\t306\t193\t148\t185\t3e-06\t46.6\r\n", "Spi_isotig14983\tsp|P62972|UBIQP_XENLA\t71.43\t70\t20\t0\t89\t298\t25\t94\t1e-25\t99.4\r\n", "Spi_isotig14984\tsp|P18700|TBB_STRPU\t94.62\t93\t5\t0\t18\t296\t105\t197\t6e-59\t 188\r\n", "Spi_isotig14993\tsp|Q9C000|NALP1_HUMAN\t35.56\t90\t30\t1\t13\t198\t807\t896\t4e-07\t51.2\r\n", "Spi_isotig14997\tsp|Q09694|LYS1_SCHPO\t40.45\t89\t52\t1\t1\t267\t96\t183\t1e-18\t82.4\r\n", "Spi_isotig14998\tsp|Q291J5|CLU_DROPS\t35.00\t80\t52\t0\t33\t272\t1214\t1293\t6e-06\t47.4\r\n", "Spi_isotig15014\tsp|Q26630|IDLC_STRPU\t84.62\t39\t6\t0\t88\t204\t1\t39\t1e-14\t69.3\r\n", "Spi_isotig15025\tsp|Q7SYU0|RS8_XENLA\t87.80\t41\t5\t0\t123\t1\t66\t106\t8e-18\t76.6\r\n" ] } ], "source": [ "!tail -10 Spist_blastx_uniprot.tab" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 7130 85560 565789 /Users/jay/Desktop/blast_jobs/Spist_blastx_uniprot.tab\r\n" ] } ], "source": [ "!wc Spist_blastx_uniprot.tab" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Spi_contig00032\tsp|Q9W6H5|UBC9A_DANRE\t85.81\t155\t22\t0\t133\t597\t1\t155\t1e-95\t 293\n", "SQLShare ready version has Pipes converted to Tabs ....\n", "Spi_contig00032\tsp\tQ9W6H5\tUBC9A_DANRE\t85.81\t155\t22\t0\t133\t597\t1\t155\t1e-95\t 293\n" ] } ], "source": [ "#Replacing pipes with tabs to produce a tab-delimited file\n", "#(tr only reads stdin and writes stdout, so both redirections are required)\n", "!tr '|' \"\\t\" < Spist_blastx_uniprot.tab > Spist_blastx_uniprot.sql.tab\n", "!head -1 Spist_blastx_uniprot.tab\n", "!echo SQLShare ready version has Pipes converted to Tabs ....\n", "!head -1 Spist_blastx_uniprot.sql.tab" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Manually uploading Spist_blastx_uniprot.sql.tab to SQLShare and joining with GOSlim:" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### First upload dataset\n", "![screen shot1](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.01.38%20PM.png?raw=true)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Then find the dataset, execute query, and download the new dataset\n", "![screen shot](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.29.18%20PM.png?raw=true)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Query (note: insert your SQLShare account instead of jldimond@washington.edu)\n", "`SELECT Distinct Column1 as ContigID, GOSlim_bin\n", " FROM [jldimond@washington.edu].[Spist_blastx_uniprot_sql.tab]anno\n", " left join [sr320@washington.edu].[SPID and GO Numbers]go\n", " on anno.Column3=go.SPID\n", " left join [sr320@washington.edu].[GO_to_GOslim]slim\n", " on go.GOID=slim.GO_id where aspect like 'P'`" ] },
{ "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "## Output file downloaded to ./analyses/Spist/Spist_GOSlim.csv" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID,GOSlim_bin\r", "\r\n", "Spi_isotig07984,protein metabolism\r", "\r\n", "Spi_isotig04931,developmental processes\r", "\r\n", "Spi_isotig04931,cell organization and biogenesis\r", "\r\n", "Spi_isotig04931,other biological processes\r", "\r\n", "Spi_isotig10815,other metabolic processes\r", "\r\n", "Spi_isotig08055,transport\r", "\r\n", "Spi_isotig08055,other biological processes\r", "\r\n", "Spi_isotig12889,RNA metabolism\r", "\r\n", "Spi_isotig02897,RNA metabolism\r", "\r\n" ] } ], "source": [ "!head -10 Spist_GOSlim.csv" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Replacing commas with tabs (tr reads stdin and writes stdout, hence the redirections)\n", "!tr ',' \"\\t\" < Spist_GOSlim.csv > Spist_GOSlim.tab" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID\tGOSlim_bin\r", "\r\n", "Spi_isotig07984\tprotein metabolism\r", "\r\n", "Spi_isotig04931\tdevelopmental processes\r", "\r\n", "Spi_isotig04931\tcell organization and biogenesis\r", "\r\n", "Spi_isotig04931\tother biological processes\r", "\r\n", "Spi_isotig10815\tother metabolic processes\r", "\r\n", "Spi_isotig08055\ttransport\r", "\r\n", "Spi_isotig08055\tother biological processes\r", "\r\n", "Spi_isotig12889\tRNA metabolism\r", "\r\n", "Spi_isotig02897\tRNA metabolism\r", "\r\n" ] } ], "source": [ "!head Spist_GOSlim.tab" ] }
],
"metadata": {
"kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }