{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Removing predicted *Symbiodinium* sequences from *Pocillopora damicornis* transcriptome"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### This workflow creates a BLAST database from the *Symbiodinium* clade A and B transcriptomes from [Bayer et al. (2012)](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0035269) and compares the *P. damicornis* transcriptome from [Vidal-Dupiol et al. (2013)](http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0058652) against this database. Contigs with a hit (E-value cutoff 1E-05) are then removed from the CpG ratio table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Machine-specific locations: edit these two paths to run the notebook elsewhere.\n",
    "#Every later cell refers to them, so the notebook no longer depends on the\n",
    "#directory the kernel happened to be started in.\n",
    "BLAST_DB_DIR = \"/Users/jd/ncbi-blast-2.2.30+/db\"\n",
    "PROJECT_DIR = \"/Users/jd/Documents/Projects/Coral-CpG-ratio-MS\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the *Symbiodinium* BLAST database"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Locate your local blast database\n",
    "%cd {BLAST_DB_DIR}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33496_Ahyacinthus_CoralContigs.fasta Apalm.nsq\r\n",
      "Ahya.nhr Pdam.fasta\r\n",
      "Ahya.nin Pdam.nhr\r\n",
      "Ahya.nsq Pdam.nin\r\n",
      "\u001b[31mAmil_Moya.fasta\u001b[m\u001b[m* Pdam.nsq\r\n",
      "Amil_Moya.nhr \u001b[31mSpist.fasta\u001b[m\u001b[m*\r\n",
      "Amil_Moya.nin Spist.nhr\r\n",
      "Amil_Moya.nsq Spist.nin\r\n",
      "Apalm.fasta Spist.nsq\r\n",
      "Apalm.nhr SymbiodiniumAB_Bayer.fasta\r\n",
      "Apalm.nin\r\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Building a new DB, current time: 06/08/2015 17:28:18\n",
      "New DB name: SymbiodiniumAB\n",
      "New DB title: SymbiodiniumAB_Bayer.fasta\n",
      "Sequence type: Nucleotide\n",
      "Keep Linkouts: T\n",
      "Keep MBits: T\n",
      "Maximum file size: 1000000000B\n",
      "Adding sequences from FASTA; added 148436 sequences in 6.22721 seconds.\n"
     ]
    }
   ],
   "source": [
    "#Create database from Symbiodinium transcriptome\n",
    "!makeblastdb -in SymbiodiniumAB_Bayer.fasta -dbtype nucl -out SymbiodiniumAB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Preview the query transcriptome (the same FASTA file that is passed to blastn below)\n",
    "!head {PROJECT_DIR}/data/Pdam/blast2go_fasta_Pdamv2.fasta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BLAST query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Tabular output (-outfmt 6): column 1 is the query (P. damicornis) sequence ID\n",
    "!blastn \\\n",
    "-query {PROJECT_DIR}/data/Pdam/blast2go_fasta_Pdamv2.fasta \\\n",
    "-db {BLAST_DB_DIR}/SymbiodiniumAB \\\n",
    "-outfmt 6 \\\n",
    "-evalue 1E-05 \\\n",
    "-out {PROJECT_DIR}/analyses/Pdam/Pdam_zoox_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%cd {PROJECT_DIR}/analyses/Pdam"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "transcripts_v2_15|pyruvate\tmf105_rep_c5718\t84.13\t2709\t416\t13\t1996\t4695\t3027\t324\t0.0\t 2623\r\n",
      "transcripts_v2_15|pyruvate\tmf105_rep_c5718\t85.35\t1672\t238\t6\t265\t1932\t324\t1992\t0.0\t 1727\r\n",
      "transcripts_v2_15|pyruvate\tmf105_rep_c15738\t85.78\t1034\t141\t5\t3666\t4695\t1165\t134\t0.0\t 1094\r\n",
      "transcripts_v2_15|pyruvate\tmf105_rep_c15738\t85.78\t1034\t141\t5\t265\t1294\t134\t1165\t0.0\t 1090\r\n",
      "transcripts_v2_15|pyruvate\tmf105_rep_c18202\t85.71\t952\t131\t5\t3721\t4670\t1209\t261\t0.0\t 1003\r\n",
      "transcripts_v2_15|pyruvate\tmf105_rep_c18202\t85.71\t952\t131\t5\t290\t1239\t261\t1209\t0.0\t 1000\r\n",
      "transcripts_v2_15|pyruvate\tmf105_s62900\t85.01\t447\t59\t8\t4020\t4461\t456\t13\t7e-124\t 448\r\n",
      "transcripts_v2_15|pyruvate\tmf105_s62900\t84.56\t447\t61\t8\t499\t940\t13\t456\t2e-120\t 436\r\n",
      "transcripts_v2_15|pyruvate\tmf105_s66380\t85.98\t371\t52\t0\t609\t979\t1\t371\t7e-109\t 398\r\n",
      "transcripts_v2_15|pyruvate\tmf105_s66380\t85.98\t371\t52\t0\t3981\t4351\t371\t1\t7e-109\t 398\r\n"
     ]
    }
   ],
   "source": [
    "!head Pdam_zoox_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 19174 230088 2319915 Pdam_zoox_matches\r\n"
     ]
    }
   ],
   "source": [
    "!wc Pdam_zoox_matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C ID_CpGsorted.tab Pdam_GOSlim.tab\r\n",
      "CG Immune_contigs Pdam_cpg_GOslim\r\n",
      "CpG OA_diff_CpG Pdam_cpg_GOslim.tab\r\n",
      "G OA_diff_CpG.tab Pdam_zoox_matches\r\n",
      "ID_CpG OA_diff_contigs comb\r\n",
      "ID_CpG.sorted OA_diff_contigs.sorted fasta2tab\r\n",
      "ID_CpG.sorted2 OA_diff_contigs.sorted2 length\r\n",
      "ID_CpG.sorted3 OA_diff_contigs.sorted3 tab_1\r\n",
      "ID_CpG.sorted4 OA_diff_contigs_CpG tab_2\r\n",
      "ID_CpG.sorted5 OA_diff_contigs_sorted4\r\n",
      "ID_CpGsorted.csv Pdam_GOSlim.sorted\r\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract the IDs of matching contigs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Printing only the contig ID (first tab-delimited column) from the output file.\n",
    "#cut writes the ID alone, with no trailing whitespace appended to it.\n",
    "!cut -f1 Pdam_zoox_matches > Pdam_zoox_matches_IDs\n",
    "!head Pdam_zoox_matches_IDs\n",
    "!wc Pdam_zoox_matches_IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n",
      "transcripts_v2_15\r\n"
     ]
    }
   ],
   "source": [
    "#Removing annotation after pipe\n",
    "!sed 's/[|].*$//' Pdam_zoox_matches_IDs > Pdam_zoox_matches_IDs2\n",
    "!head Pdam_zoox_matches_IDs2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Collapsing consecutive repeats of the same seq ID, keeping the BLAST output order.\n",
    "#uniq only merges adjacent duplicates; the sort -u step in the next cell\n",
    "#guarantees that the list used for the join is fully de-duplicated.\n",
    "!uniq Pdam_zoox_matches_IDs2 > Pdam_zoox_matches_IDs_uniq\n",
    "!head Pdam_zoox_matches_IDs_uniq\n",
    "!wc Pdam_zoox_matches_IDs_uniq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Sorting file in order to join; -u also drops any non-adjacent duplicate IDs.\n",
    "#The line count printed here is the number of distinct contigs with a Symbiodinium hit.\n",
    "!sort -u Pdam_zoox_matches_IDs_uniq > Pdam_zoox_matches_IDs.sorted\n",
    "!head Pdam_zoox_matches_IDs.sorted\n",
    "!wc -l Pdam_zoox_matches_IDs.sorted"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove matching contigs from the CpG ratio table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10000_Transcript_2/3_Confidence_0.667_Length_676_transcripts_v2_2486 \t 0.761861\n",
      "Locus_10001_Transcript_1/1_Confidence_1.000_Length_199_transcripts_v2_9515 \t 0.635946\n",
      "Locus_10002_Transcript_1/1_Confidence_1.000_Length_695_transcripts_v2_2386 \t 0.695709\n",
      "Locus_10003_Transcript_1/1_Confidence_1.000_Length_609_transcripts_v2_2870 \t 0.12449\n",
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 \t 0.230119\n",
      "Locus_10005_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9134 \t 0.530625\n",
      "Locus_10006_Transcript_1/1_Confidence_1.000_Length_167_transcripts_v2_11475 \t 1.34405\n",
      "Locus_10007_Transcript_1/2_Confidence_0.857_Length_1261_transcripts_v2_788 \t 0.746746\n",
      "Locus_10007_Transcript_2/2_Confidence_0.857_Length_1272_transcripts_v2_779 \t 0.758383\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 \t 0.266367\n",
      " 72890 145780 5804568 ID_CpG.sorted\n"
     ]
    }
   ],
   "source": [
    "#Now using CpG ratio file obtained from workflow in Pdam_CpG_ratio.ipynb\n",
    "!head ID_CpG.sorted\n",
    "!wc ID_CpG.sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Locus_10000_Transcript_2/3_Confidence_0.667_Length_676_transcripts_v2_2486 0.761861\n",
      "Locus_10001_Transcript_1/1_Confidence_1.000_Length_199_transcripts_v2_9515 0.635946\n",
      "Locus_10002_Transcript_1/1_Confidence_1.000_Length_695_transcripts_v2_2386 0.695709\n",
      "Locus_10003_Transcript_1/1_Confidence_1.000_Length_609_transcripts_v2_2870 0.12449\n",
      "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976 0.230119\n",
      "Locus_10005_Transcript_1/1_Confidence_1.000_Length_207_transcripts_v2_9134 0.530625\n",
      "Locus_10006_Transcript_1/1_Confidence_1.000_Length_167_transcripts_v2_11475 1.34405\n",
      "Locus_10007_Transcript_1/2_Confidence_0.857_Length_1261_transcripts_v2_788 0.746746\n",
      "Locus_10007_Transcript_2/2_Confidence_0.857_Length_1272_transcripts_v2_779 0.758383\n",
      "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389 0.266367\n",
      " 69992 139984 5443744 ID_CpG.sorted2\n"
     ]
    }
   ],
   "source": [
    "!join -v 1 ID_CpG.sorted Pdam_zoox_matches_IDs.sorted > ID_CpG.sorted2\n",
    "!head ID_CpG.sorted2\n",
    "!wc ID_CpG.sorted2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process removed 2898 sequences with close matches to *Symbiodinium* transcriptome (72890 rows in `ID_CpG.sorted`, 69992 rows in `ID_CpG.sorted2`)."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}