{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Blast and GOSlim annotation of *Pocillopora damicornis* transcriptome " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This workflow details the annotation of a *Pocillopora damicornis* [transcriptome](http://2ei.univ-perp.fr/telechargement/transcriptomes/blast2go_fasta_Pdamv2.zip).\n", "\n", "The notebook requires that you have the following installed:\n", "- [NCBI Blast: 2.2.3](ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/)\n", "- [SQLShare](https://sqlshare.escience.washington.edu/accounts/login/?next=/sqlshare/%3F__hash__%3D)\n", "\n", "The annotation also requires a UniProt/Swiss-Prot BLAST database. Instructions for setting up this database can be found [here](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/README.md).\n", "\n", "The original analysis was carried out on Mac OS X v10.10.3 running Python: 2.7.9 and IPython: 3.1.0.\n", "\n", "This workflow is structured so that anyone can reproduce the analysis by downloading the repository locally and executing the notebook." 
] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/jd/Documents/Projects/Coral-CpG-ratio-MS/data/Pdam\n" ] } ], "source": [ "cd ../data/Pdam" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 9425k 100 9425k 0 0 841k 0 0:00:11 0:00:11 --:--:-- 1128k\n" ] } ], "source": [ "#Obtain FASTA file\n", "!curl -O http://2ei.univ-perp.fr/telechargement/transcriptomes/blast2go_fasta_Pdamv2.zip" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: blast2go_fasta_Pdamv2.zip\n", " inflating: blast2go_fasta_Pdamv2.fasta \n" ] } ], "source": [ "!unzip -o blast2go_fasta_Pdamv2.zip" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1|spectrin alpha chain\r", "\r\n", 
"tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttggtgagagactgatagagctcagcctcagtaacatagagacgttttccgccctcagtgagcgccttgaaggcgttgatgacttcttgactagagccgacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctc
tccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagc
ttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttac
gagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r", "\r\n", ">Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2|spectrin alpha chain\r", "\r\n", 
"tatacgattttatgccgtggaggtgttttcttgcagaagtttcaaattatgtcctaattgtagtgtagaacggactattggacataatttgaaacttctgcaagaaaacacctccacggcataaaatcgtatatggcgatacatgaaccactgcttcaggaacactgtcttccttttgatgtttaaaccatgcactggcccacatttttgtttgctaatataacagaacttttccgctatccctaaccaagtaacaactcatcactctaaacataataacatgcactgaagaaacattacattagtaattctcttgaacactgtgatcagttaatctgttgatgatagcttcagtaatacctagcgagctgctggggtattgtcatcttccagttacattaaaccgtgaggtgtttcgtttcttcagaaataaggaacataccacaattactgcaaatctatgccggactgactaatttagctagaagagaagagttcctcgcagaaagtcttgtagtcgtaagctcccggaacttctcggcccttgtcgtcaacgtaaggattcatacgatcaatacagaagtcggcctgctccttagtgagagactgatagagctcagcctcagtaacatagagacgttttccaccctcagtgagcgccttgaaggcgttgatgacttcctggctagagccaacattttctgtttcacgactgatcatgaaggccatgtactctcccatcgacaccacaccgtcaccgttgggatccaccgttctgaggatgctttggaactccggatcttcttcgccctcttctacaatggagagatcgtagccaagagaacggaggcatgacttgaattcttgatgatccaggtaaccagtcttatccttgtcaaagtgcttgaacatgatagtgaattctttgagtgtatcctcagaaacgccagtggtattgcgggcctggatttgctgttcaagattgtgtttcatgcgcatggcaagttgatccagctgatcccactgctgagcaaggtccactgtgctatgttcggtatacttgttgtccaggataagagcctcttccatttgagccccaagatcctccaggatcctaaggtcctctttcctgtctgctatctcagcactcttcttcttgacctcggccagttgatcttcaaggtccccttgaccgtcaaccattgcaacccttgtatcagagagccaagcgtggaacgagttggcagcttgtgcaaattcctgcctgaggttgtcattatactcctgacgctgggcttctttgcgcaaatcatcctcgcgttcctcaataatcttctgcagattttcccacgtgtcttccaatgcttccatagtaaaccaagtataagggttgatggagacgttatagctcttgatctgacgatcaagcttcctcaacatcataatgtcgtcttccgcttggtccagagaagcgcggaactgggtgtggccatcttgcaaagcacgtatctcctcaacagagttgcagcgcactggatcggtcagatcttcctccgcattctcgaaccaactgttaaaagcagaggctttcttggcaaacaacaggaacaagtcctccaccttcttgtattgatcctgggcatgttgaagtcgttcttttcgagtcttagagtcttccagcagctgctcccacctcctaataaggtcatcatgacgtttgatgatggcaggcgactgctcgtgttgtgattggacaagctcgtctttcagcgcagtcacacgtgcaatgccttcgttttcaaatgcttggagacctgagtcaaaggtttcctgtttggtgaaaagtgtttgcaccgaagacaaatcgcgccccagatcatctgaccgagcaattccttccttgtcaccaatccaggattccactacatcggccttccaattgaactgcaggaaagctgagttgtcattgagcttggacttgcgataggacgccattctc
tccaactcggtaagtttactcttaatggatgcaatcctgtggtcaatcaactcagattggtggttacccttttcaatgagcttgttgccagcatcttcaatgtcctgaactctttcccgatgaacctcaagatctgtctcgaatgcttcgtgcttctttaaaagaccttggactgcagccagtgtatcaccatagtcgtcacttccaacaagagtgttcttctcattgatccacgattcctcttcaccaacattagcgctgaactgttgatattccagagactcatctagcttgtgctgcctctgctcagccaggtttttcagttcgtcccagttttcctgcagctgatcacagcgagctttgatttcatcagagctactgtgtccttcgtcaataaatttacggccacacagtagtacagcctgaattctggcctcatgggtgttcaattcagcttccaaacgctgatgtttcttacgtaggttttgtacaccagtcaagtcctttccataatcttcagagctggtcaatagctttttctcctttatccaggactcttcgtcgtccacatctcggtaaaactggtgcaaagcgttggactcgtccaacttcttatggcgatctgcggccatgtcttttactttgtcgtagcgttctgcaataatgcgggatttgtcttgcagtgagtcagcatcaaaatggccaacctcagcgaagtgctgggtttgggcttgaagatcagtgatacgatcctcgtgagcagcaatatctgcttcaacaagttggtgcttcttgatcagattctgaacactggccagatctcgaccatgatcatcgtgagacaaggaagcttcaacctctcccagccagaaatccagttccttaacatttgtattaaattgctgttgttgattagattccttcagatgttgactcttctcattggacttctgtactaggtattcccactgctgctgaagtttggtgattctctctttcacaatctcctcacttccggcacacttgcgctgctcaatgagaccttcagctaggttgatggtctccaagactctctcctgattggctgagacttcagcttcaaacgcctgatgcttttggaacttggactgtaagttggttggatccttgtaagattcatccagcactgtctgtaacttttcactgacccatgcttcaatgtcctcagcatcacgactgaactgctgaatggttttggactcacccagttttgatcggcgttcaaccaaggctgccttcagagtagcccatctcgctaagactgcagaaatcctctctgcaatagctggagagtcataatgattgttatcaatcagccgatcagcattgtctttcagagcattgatcttcacgtcctgcactgcaagtgtctttgtgaagtcttcatgtttcttgatcagagcttcagcaccctccgctgcttctccgacatcttcgctctgaatgatggcctcacgtgtcgccatccactgctccagttgttcggcatctcgattgaataactgcaactccaaacattcatcgagtctttgcttgcgagaagcccaggccttttccagttcttctcgttctgtagccatggtctccagtttttcacggatgtcgggactagcataatgttctttgtccaataatttcttgccgaaatcttcaaacgactggaactcgctgtcacgcgcatcaatttctgcgcgatgttcctgatgcctgtccaacagagcttctgcactggccacatcctttgcaagttcatcagaagttaccaatgccataataccattgatccatgaagtaagatctctgaattcgttgaggaaatggaagtaatctgacgagtcactcagtttgccccttcttgcagccgcctgttctctgagattagcccacgcttcttctagctcgtgtctcttccattcaatgtcttgtgctgcattcggatgactggatgttaacttagcagc
ttccacgttgagttcacgaaccttctcttccaatgcggccaagtctctctccaaaacttcatgctttctctgcaaggcctggacactagccaaatcccgaccataatctgaggtggaaagagcagtgtctttctctgaaatccaagccttggtgtcatcagcatccctgtggaatttctgaatttcctgagcattatccaaattttccttcctctgctctgacatggtcttaagattggtccatttctggttcagcctctcaatcatctctcggatcatctcatattctctgtagtgttcaatctttagtttctcagccagtaatgtcagttctctgatacgaacctctttagaaatcatatccttttcaaagtcatcaaacttcttctgctgagcttccacatgttcgtagtcctttccgatttcctctgaggtgacaattgcttccttgtcattgatccaagactccagttcgtgagcatcccgcaacacactgtatctttgaatggaatcttccaatttcttcttacgttcttcgcctttgtccatcaggtcttggtatttatcatccaaagcctgctgtctgttagcaacagtgtccacttcaggagcctgcgacagaaggtcctgtgaggctttgtgggagtcaatcctcttgacatatgccgcaggaacaaaaccttgtctgtcattagtttctactttccaccaatccttgttgctagaattgagtagggtcagaatgtcacccttctgcatggacacctctcttgcagttttctcctgatagtcataaagagcaactacacattccttgtcagagatatctgtgacatgagccgctggcttgcagtgttgactttgttctctcagcccatctacaacagttccatatgctctcaagtctgacatgatggcatcatgtttggtaagcaatgcctgtgcactgtcttcatcttttccatagtcatcactggtaacaattggttctttctccttcaaccaggattctgcttcagcaacatcagctagatactgatgagcttgaagagagtcatccagatgtccttttcgcacatgtgccttatccttgaattccagccatttctggtcaaggtcatcaatcttctctttgatctcatcagcggcaaagtgcccattatctatcatttgaacaccattatcgcaaacagctctgacacgtggttcatgaccagcaatttcagtcattaaagcctggtgtttctttgacagattctgggctcctgtcaaatctcggccagtgtttgtggatgaagcaacaggttccttctcccttatccatgcctcctcatcttcaacatcatgaaggaatcgtttaagtctctcagcatcctggagctttgctttgcgagctaggagaggagcctgtagctgttggtatcggctatttaaaacctccttcttttctttgatggaaggagcatcaaagtggtcagcctccgcaaataaattagcttgtgcattaacaacctcaatcttttctgcacgcgctatgacatcagcctctatcatggcatgtttcttttggaggttttgcacacttgtcagatcctttcctacatcctcaagagcaagcaagttttcaacttcagtgaaccacaactcaacatcttcggcacctcggttgaactgctgttgctgtgctgcttctttcaatttcaaccctttatcatttgatctctcaaacagataggcccacaacttatgcagttcatcaagtctctctctgatttgatctgaggcatagtgttcatctccaatcagttgttcaccagtgttgtcaactgcatcaagccggctttgattggcattcaactcagcctcaaatgcttggtgcttctggatctttccttgcagattggttggatccttgtaagattcatcactggcaatcttcagcttttctgtaatccagctttttacttcatcacagtctctctcaaactgctgcagtttac
gagattcctcaagtttcaacctgcgcgccttggacaactccccaatgttattacgcctttccattataccatctcttctctcacgcacctcatctgaggcatagtgattgctgtcaacaaggcggtttgcatactcgtcaatactattgatcttctctgcttgggcagcaaaagatttgtcaaagtcttcatgcttcctgatcagagcttctactccatcgagagaatcaccaaggttatcatcagcaaggaaggcctcttgtttagacatagtggcatctgcatgctcacaatcccgattaaaaagctgcagctccatacactgctcaaactgcacacgtctcctctcccacagctccaacagttccattttctcagtctccagactggcaagcttttctttgatctcatcagtagcataatgattggaattgacaagttcttctccatcatcggcaaacttcttaaatccatcctctgatgcatcaatataacccttatgttcctgatgacgttccagaagactttctgcactagctacatcttttgccagttcatcactttggatcaagactttcatttcatttataaaagaaatgtggtccctgtagtcactgatgaacctttgcagacgataagaatcctccaaccttgcttttcgcaccccagacttctccttaagatttccccaggcagttacaatttcatcttgcttagcagctatctcatctgcactttctggatatgcctcttgcagttgtgcagattctgaacccagggcagtgaccttatcttccactgcagccaaatctctttctagagcttcatgtttgcgtaacagagcattgacactggccagatcttttccataatcatctgatgacagaactttgtctttctcattgatccaattcttggtctcatcagcatcacgatagaagctgtgtatttgctgggcaccagccagtctcttctgacgtttgagtgccagcatcttcagcctttcccaagcttcattaacttccgcttgctttgtactgatcagttctatatctggatgaccctcatctccaagttgatgtgcaagctcattgatgtatgtgacccttgattcatttgcctgaatatccttcaagaagtcctcaaatttcttctgaagaacttccacatgttccaaatctcttcctacttcttcagaagtggcaattgcttctttttctaaaatccatgacataacttcctctgtttcatgcaagaagtggactcttttctgagtaaagagaagcatgcggcctttctctgctgatttggaaagcagcaattcccataacttgatgagtgagtcaagacgttccctgataagttcagaggcatagtgggactcactgatcatgccttcaccattttcttggagttcaataatggcattgctatgagctgaaatctctgcttcaaacg\r", "\r\n", ">Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3|vitellogenin structural genes (yolk protein genes) family member (vit-1)-like\r", "\r\n", 
"gctcgagtttgtgtgatgcctctttacagagagcacagattccttggtttggctcggcctcaaacaaagcttgcttcccacaggtgtaacaaagacgattttcatatgacgcggatgcagtccgtcactcttctcagcattatgcgaaggcgaattcaagggcaagtcactgatatagcgccatttctcctcatatcccttctgtggttcaatatcttcgatcatgtgttggaataacgtgacttgatccgtgttcatagggacggttgcgcgggtccattccacagggttcttggatgcatctttgtgaatatacacaactgtcgcgaggcgaacatgacggcgaccattcacgaatgtgatcttgaagagcttggggtagcgattatcatgcacagtctctaaacacccttgggaatttggcctcaccttaacgaatccctttccggtgacaaaaagtttcacaaattgatgatttccgtcgataaagaagactttgaagacagtaggtagctcgcttaagacgagttcattgtgctcattcacagacaggtattttccatttcctgccttcagagtccagttggcataacgatgatgatgatgtcccctggcaatctcagtaacttctccgatctcaaatccgtccagctctgtgctcctgtagattgaagaccaactcttggaacctgtgacaggtttcctgtacatcagccaggtgttgctcgccttgttagttgataaaatctcctgtctctcttgtggcaagtccatttctgtggtgtagctttgatttctgatgttgtactggaatctgccacctaatccagtagtgatattggtgtggatcgttagcttgctaccagtctccaactgaggcagagatactttcatttttccccaaagagaatgaagaacactaatgttcaattttccactaagagtgacttgatcgggaagaggaatgaagaatcggggttgaacctcagctttgaccgtggtgttcaaacgcaggaacaaaacgctgtccacagagaaatccaggggcagaccggcacacgagggctggcgtatcttcacttcaattggacggaggattttggagatattccattttttacctttcttcagcaggtccttcagatcgttgctaactctgatgactccatcttccattaactctttgatgtcatcataggtgaaataattatagaacagttcttttccaaaaactttgaagtagaaggaacctttggctttctcctcttcatgttcagcggttggcagctggtgttggatttccttgatttctttcccattcagactgttgtctaagcttcgtttcggtctgagcaagccaaacagactcttatcttcgctgtagtgtccccttggtcccatgacacgttggagcagctcttggattccctcaccacggatgccagtctcgaatagattcagagattttccaagcataccggtgtgtagtttgactttagagttgatagctctcgggatgaagcttgatggcgtggaaatcatgcgaagatcaactgcacttcccatctggagaagatcggagaagtcgccaaagtgaagccatttggagttcactgggtttatcttgtatctcttaagcgctcgaagggcgattcggagagcgattcgtgcggtttgagccattttttcgtcacatgggtatctagagaaagccatccccttgatgtaagacaccacaaaagatcgcacttgattggtgcgctcgtaatagagctgcttgactaccaagttgaacaccgctggaccaggtttgcagtcgcagatgatcacaaaacaagccatgcgcagttcggcgtcgttgtttggttgacggaagatctctaacaacactggcaataccttttgtgaaattttcggagcgattcttcgtagagcatacacagctgtgacgcgtaattccagagagttcctgttgtccttgataatttccaaaagaggt
tggtaaaaatctctgtgaccgaaattaccgatgcccttcaagatgaagatcttttcacggtaactggcatactgcaaacgactcttaatctcagccacttggttggcaaggcaagtgacgtccttgtctttgcagaagtcatgcatcaaggcaccggcagtaaggtaacaggttttcttcagagtctcgtctgcttggacaacagaaccagaacagagatccacaacggtcaagcacatctttcctgttgggtttggagttaaggctagaccacgaataagtgagactgccctcccagagtccagttccaatgtcttgattttttctgagatgagttcaaagacctcctccttgtacacataaggcagagcttccagcagccattgcctgcgtttctcgtcattgtagcagttgtcccagatcttgataagagtttttttgcaggtcctgcgtaaggtttccaccaaacgactgaactgccaagacatcaaattattctcctccacgctcttgatcaggtcgtcgatcaattttctcaccatgtttgcagtctgggtacgttgttcctcagtctctgctttgtcctcatcctcatccacagtcatttccaatgtcctagctgtaacaccactaacggtatagctttgggattcagacctcacttccttgaatttcaggtattgcttggtcaaagttctggcatttcctttcatctgtgttggtgcaaaaatgtatcttcccatcgttctacaatccttcaccacaaattgttcctcggttccgaaaagatcacatgatgctgacacagagatctgtaggggctcatcaatgacgctgcagtcctcgctaagacagcgcactcctggtacgtttgaaaacacgcggggttttccaatgcagttgtgcaggtttaaggtctttgtaatttgcaattccttgattgtgttggtctccagtgtgttgtgacgcacatcatacaggttttcacacattccatgagtgctgagctcccagttcttgtacagtcttggttcacggtgatgtaatgaagactcaggcttggtgaagttgtggtggatcatggaaagaactcctcgtttgatgttcaaaatgtactcaggctcatcgtcattgcagaagatctgaccaacttttccatctttgtactcaaacttgattggtcgttccagcaacggcttcaattcattggagactggggagacagcgtgtgacacagacttctacttcgaaacagacactttggcttatttgaagataagaaacccaattctgtacgaagttaacgtaacatccttcgaccaatcagagtcacacactgtctccccagtctccaatgaattgaagccgttgctggaacgaccaatcaagtttgagtacaaagatggaaaagttggtcagatcttctgcaatgacgatgagcctgagtacattttgaacatcaaacgaggagttctttccatgatccaccacaacttcaccaagcctgagtcttcattacatcaccgtgaaccaagactgtacaagaactgggagctcagcactcatggaatgtgtgaaaacctgtatgatgtgcgtcacaacacactggagaccaacacaatcaaggaattgcaaattacaaagaccttaaacctgcacaactgcattggaaaaccccgcgtgttttcaaacgtaccaggagtgcgctgtcttagcgaggactgcagcgtcattgatgagcccctacagatctctgtgtcagcatcatgtgatcttttcggaaccgaggaacaatttgtggtgaaggattgtagaacgatgggaagatacatttttgcaccaacacagatgaaaggaaatgccagaactttgaccaagcaatacctgaaattcaaggaagtgaggtctgaatcccaaagctataccgttagtggtgttacagctaggacattggaaatgactgtggatgaggatgaggacaaagcagagactgaggaaca
acgtacccagactgcaaacatggtgagaaaattgatcgacgacctgatcaagagcgtggaggagaataatttgatgtcttggcagttcagtcgtttggtggaaaccttacgcaggacctgcaaaaaaactcttatcaagatctgggacaactgctacaatgacgagaaacgcaggcaatggctgctggaagctctgccttatgtgtacaaggaggaggtctttgaactcatctcagaaaaaatcaagacattggaactggactctgggagggcagtctcacttattcgtggtctagccttaactccaaacccaacaggaaagatgtgcttgaccgttgtggatctctgttctggttctgttgtccaagcagacgagactctgaagaaaacctgttaccttactgccggtgccttgatgcatgacttctgcaaagacaaggacgtcacttgccttgccaaccaagtggctgagattaagagtcgtttgcagtatgccagttaccgtgaaaagatcttcatcttgaagggcatcggtaatttcggtcacagagatttttaccaacctcttttggaaattatcaaggacaacaggaactctctggaattacgcgtcacagctgtgtatgctctacgaagaatcgctccgaaaatttcacaaaaggtattgccagtgttgttagagatcttccgtcaaccaaacaacgacgccgaactgcgcatggcttgttttgtgatcatctgcgactgcaaacctggtccagcggtgttcaacttggtagtcaagcagctctattacgagcgcaccaatcaagtgcgatcttttgtggtgtcttacatcaaggggatggctttctctagatacccatgtgacgaaaaaatggctcaaaccgcacgaatcgctctccgaatcgcccttcgagcgcttaagagatacaagataaacccagtgaactccaaatggcttcactttggcgacttctccgatcttctccagatgggaagtgcagttgatcttcgcatgatttccacgccatcaagcttcatcccgagagctatcaactctaaagtcaaactacacaccggtatgcttggaaaatctctgaatctattcgagactggcatccgtggtgagggaatccaagagctgctccaacgtgtcatgggaccaaggggacactacagcgaagataagagtctgtttggcttgctcagaccgaaacgaagcttagacaacagtctgaatgggaaagaaatcaaggaaatccaacaccagctgccaaccgctgaacatgaagaggagaaagccaaaggttccttctacttcaaagtttttggaaaagaactgttctataattatttcacctatgatgacatcaaagagttaatggaagatggagtcatcagagttagcaacgatctgaaggacctgctgaagaaaggtaaaaaatggaatatctccaaaatcctccgtccaattgaagtgaagatacgccagccctcgtgtgccggtctgcccctggatttctctgtggacagcgttttgttcctgcgtttgaacaccacggtcaaagctgaggttcaaccccgattcttcattcctcttcccgatcaagtcactcttagtggaaaattgaacattagtgttcttcattctctttggggaaaaatgaaagtatctctgcctcagttggagactggtagcaagctaacgatccacaccaatatcactactggattaggtggcagattccagtacaacatcagaaatcaaagctacaccacagaaatggacttgccacaagagagacaggagattttatcaactaacaaggcgagcaacacctggctgatgtacaggaaacctgtcacaggttccaagagttggtcttcaatctacaggagcacagagctggacggatttgagatcggagaagttactgagattgccaggggacatcatcatcatcgttatgccaactggactctgaaggcaggaaatg
gaaaatacctgtctgtgaatgagcacaatgaactcgtcttaagcgagctacctactgtcttcaaagtcttctttatcgacggaaatcatcaatttgtgaaactttttgtcaccggaaagggattcgttaaggtgaggccaaattcccaagggtgtttagagactgtgcatgataatcgctaccccaagctcttcaagatcacattcgtgaatggtcgccgtcatgttcgcctcgcgacagttgtgtatattcacaaagatgcatccaagaaccctgtggaatggacccgcgcaaccgtccctatgaacacggatcaagtcacgttattccaacacatgatcgaagatattgaaccacagaagggatatgaggagaaatggcgctatatcagtgacttgcccttgaattcgccttcgcataatgctgagaagagtgacggactgcatccgcgtcatatgaaaatcgtctttgttacacctgtgggaagcaagctttgtttgaggccgagccaaaccaaggaatctgtgctctctgtaaagaggcatcacacaaactcgagcgaactgtgttatggcaaacaaatgct\r", "\r\n", ">transcripts_v2_5|---NA---\r", "\r\n", "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcatttttctcaattttgaacaagacttcttagcatccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaatttt
cagagtaatcctagggtcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagttatttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttagaaatgcatcattaactattgttaacatattagctccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtg
gtagtttgtaattaacctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctgagatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgttttttggtttgaacttgctataagcataactacacttgggcagctcaactccaatcatgaaayggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtacc
aaaatcaagtatctttcaaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggtgtaagtcattacttgccaatctatggacagtcgacaagcactttctcattggtgaatgcaaattttttggacattgtacttgtcagttgtggcatttattccaaaattttttttgtatgtgcaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggttgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\r", "\r\n", ">Locus_180_Transcript_15/16_Confidence_0.327_Length_6143_transcripts_v2_6|---NA---\r", "\r\n", "gtgaagatgaggaatctaaggaatggcctgtttgcaaattgagttactggatttctctggcaagtagttttggatcaatttttcatactatttttcttgtgtgataagcaaatgattgcattccctgagacagttttttccaggacggtgaaatcagtgaaacaagagcctgtatgtcactagtgactctgagtaaggccttcatttgagcccagcattgcagtggcacttgtttaagttcttggagaagtgactgtaatcaactgatgacctctgcacagcagcaagaactagatgcaggcaaaagttttaaagtccctcagctttgctctgtagcatgctacctaaagttttcagatagtgagaattcatctgatggaagaagctaaatctttggattactatgagagtcagaaagcagtgtgtaacatgggtattactcataatttcaaagtatgtcaaggatgtcattaaggtagtgaacattttcttgagcttttgagtcttttttttctcttgtttattctgaattcggttttcagcagttttgtggaggaatctgtcggcaatgatcagtggtgattttctgccatagtggtggtgttaaatttcaattggcaacttcttgaggtggcattccagcttttagcaaggtgcaagttccactatcacttttagatgttaaatcagcaggcaaacccaaccaatttggacatttcaccattcacgtggctttcagaaaaaggcttgcagcatatcagtaggactggtctgtatgggtctggcctcaggtagctgaatttaactgagataatcctgcagaaaaaaagaggacagaagtacacgtacaatgactgttggtggaacttcctcagaagtggtaatgttaggcaaccaaattcattcttccattgttttaaagaaagaatcagaaacactttgcaccatttcaattgatatgagcagtcctaattatctggtaaggaaatatttcaaacaagtttatgcatggactcagttattttgggtatattttttggatctcttttcaagtgtctttcagaatgtatacagtctgtgtacagtagcagtttcttggaaaagcat
ttttctcaattttgaacaagacttcttagcatccagcagctttccctggtccttgttaaaaaccttaaatgtgcatcaagcatgaaggaaggccctactaacccaaaggtgtgaacttaatgggtattaaacgttctccgccttacaatgggtgcagtttgtgtggaaactttttgccaggttttcagccccagcgtgtggttcacaattcactctattggtggaattagtgcattaggacaagggcaattttcagagtaatcctagggtcaggaggcacattttcagggtcaaatgtaagtgcgtgctgtaattcttctcaagtgagttttgtcctattggctaccctgtcagttttttttgtgagttgtgtgtgcatggtttttaatgaatttgttaggaatatgtatgcagagttagcaacgatggttttttaggactgtattgtacccaattcaagtccaaaatggaattgtcagaccataaagtggtttttcagaatttggcaagcacctcggaggcaaattcagtggtacacaaaagactgtattgtactcaattcaagtccaaaatggaattgtcagaccaccaagaaatttttcagaatttggcaagcacctcagaggcaaattcagtggtacacaaaagaagtgaatataccacatcccggagaacttaaaccttcttcagaaatgctgagtaaagttaaccactctgaagtccttcatcattttggaccaaagcaacgcttttacctgtttgatattaatatttacaatagtgcagaccatctggtaagggaaaccttcaaatcaaccatatatgagtatgtatttcctgtggatttcttttttaaggagtcttttttcaggggataagcctagttatttcagcagcactttaccaaaccaactgaaattatggagcagctttgaaaattatccaagtccttttgtgccaagttagcatcactgtggtaatatccagtccagttaagtggttaaaagttaacctcctagatcctcgaaagagttttgccatacctgtccaagttaagggccatgtgctgtcactcaagtgtagctgaacatgctgtgtcgtagagaaccacatccctcttgtgtatttatgtgccagtgcattgtctataccagtttcaggactgcgcaatgtaccgtgtattttgttgcaacaaatttatcagaacttgtcactctggggtaaaacaaccagcatattgggtgcattttcgacctattcctaaagagcaaattcatctacacacatcttgtattccatgtgttgtgccacctgccttttagatgtcaagtgtgtattgcttcatggaaagtgtctgtctgttcaacaaaaaacagcccactcatcaacaaacttttaagatgaacatttcaaaaaatctggttatgttactccttcaaagtctgtgattgtcttgctcccatgggcagtcccaagtcctagtcactgattctctaagtaggctgatcagaaatttaccagtacattcatggaggtgcaattcacatctgaatccttacttggtggttaaagattcttgtagaaggttaaattaaaggttggttttatgttttggttaatttgtagtatttgtaattagacaattcatatttctcaattaagttatccagggatgaggtaatataaagaccatactgtctctcaagaattgttcctatgtacttgattaagtgacttaattaataagatatagaattaagtttaacctagaagtatcatcaaaaccagtacatactttgtcagaccttacagatattgagtatcaccaagtaatggaagttttttggcttcggcaagtatttgttttggaatgttgccttttttgttgttaatatgaaaagtatgctttgtttgtctgtaattgcgtgtttttcatgtcatgagaatcagtggaagggtttactgtgaaataccatctactaagatttatttctcttaga
aatgcatcattaactattgttaacatattagctccatacctatttatgcttgtgtataaagtcaaaactgtgaagagctttgtctgtgattggggggatatttttgttctcttttgattgttaccaggcaaatttgcattttttggatggaatgtcaagaatatgaaaaaggcaagcttaactttctgatctttagtttcattatagattactctttttttcattttgtgatctaacatgcattttgatagtggtagtttgtaattaacctattccacacgtttttttttttaaatttatttatttactgtaactgaaaatcttagttaggttggaatacttcattatgtaatctaggatttattatgggcaaacagtgcatatttttgtcaacctctaaagcaactaatgtgatagatgaagctcagtttcactcatttggtctgtgagcaaaagcagctgggaatactttttacaggataacaatcaaggcttaatcaaaaatttctgctctgaactgcaagttgtcatcatgcagcaatcatttggttgatgcaggtattgattttaactgttgccttttgaattgttaggtctttgaaatattttgcatactctgattaattcacagcgcttttttctggatcaacaattgttgccaggtcatctttgcccaaaatggttatggccttgacttgctgtttgctgtcggtcagttcagtagtttgtcttgttcaaaatttttatgcagaaatctgtgcatatatcctctctacacagtgttgtaatcttgttggaaatgaagggctttcaaatgcatcctcttcatccagttgatacaggtatcgatgtgaccatttggcatctgctttaattgtttctctgagatgcattacatactttgatcaattcatagtgatttctgttggaatatactagaattcaacttaattttgaagttttcttggcatcgactttcaaggctgagttggtcagtgattgaagttcaaaattgtcgcccagaatcatttcatttcatctcttcaagcagatttgtattttcactgggtgttgtcacccttccaagtgctgcctcgtcatttagtttacacaggtattcgcttgaccttttgccacctggagcattttttctttgaaatatattgcatactctgatctttttacagcattttttattgaatgtgctactgttgacagctcaacttggcctaagattttatggccttcactttccggcatgcataggccagttccatgatgtggcagttgaaatcgatgtgtagaaatctgtgtacatttcctctatatacagcttcgttaatcgattggatgcatatgtccttcaaatgcgatcttgtcattgtgttgattcaggtattcacttaacgttttgccttctgaactgtttttcttcgaaatgtatgacatactctggtcaattcacagcaatttctgtttgaatgtggaacggtggacagcccaacttggtcttggaacataatggccgtgacttcttttgtccatgaacgttagttttacgatttgtctagtttcaaactgttatgcagaaatccgtgcattatatcctctttacacagctttgaacatctcctctaagctagttggaatttgtgagcattttggacaaagccagatttttccattccatcggcacgctttcagaagccaattaccagtaaatcttgagatttgtcaagggaaaaaattctaagcaaactggatccgaaaagcagaggtacttcttcttgtcatctttttcaacttcagtgtttctctcgagaccatgtatgtttcgttagtatcttcaatagtttgtccagccaagcgcagcacaattaaggttgaggattaggtaccaacttttgaggtaattgcaatatgaattaccatatcattatttcaaattagtggcaatgtggtgcatgctttgttcagtcttgacagaacttttgtttttt
ggtttgaacttgctataagcataactacacttgggcagctcaactccaatcatgaaacggtttccttgtacaaggtatttccagcatgaattatcacttcaaattattggcaatgcagtggatactttgcgaagtcgacagcatttttgtttctagttcattcatgtgttagtgattccttggttacctttatcatttgagatgtcgtcaaactttggctaaatcttggaaggctttcaagatatcaggtaccaaaatcaagtatctttcaaattccaatttgagtagatgtggtgtatgctctgttcacttttcacagcagtttaatgtgccagatggaatttacattaattgaacttggtcaccttcaagctaaggcctggtgtgaacatcgtggcgtaagtcattacttgccaatctatggacagtcgacaagctctttctcgttggtgaatgcaaattttttggacattgtacttgtcagttgtggcattcattccaaaattttttttgtatgtgaaaatgttggatattttgtcaggttccacagtcttctggtaatttttctgtcccactccgaaataaggtggattgtgtgcctgcagtatcgaaagcatcaaacatttcctcttgacagtcagctgtttgtagtctttctaatcacatttgttcagataaactctggtgaactggatacctaaaattcaggtattgaatcttattttttgccactattatggtttgatttcaacatgcatgtcaacactatgtgtgttccatcagtatcttgagcagtttctgctgtggcaagctccacttcaactcggatatacaaagtaagacatgattttagtctcatttgccatcacattcagtttgaaatggttgtctgggaaacagtgtatactttgtaagtctcggcagcgttgtgaacaactcctacaggatgaaggttagttttcagatatgtatttgtttctggtcgcaaaggttttgtatatttctttttcaagtgtagtgaagcattggatgaagg\r", "\r\n" ] } ], "source": [ "!head blast2go_fasta_Pdamv2.fasta" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ">Locus_9682_Transcript_1/1_Confidence_1.000_Length_116_transcripts_v2_72920|---NA---\r", "\r\n", "ataaaaataccaggaattgaggaagcaagcagcggcatgccggagcagacaggcaaaaaggaaaaccagaatacgaaaagaaaggaagacgggaatctgcaagacgctttggtgga\r", "\r\n", ">Locus_9570_Transcript_1/1_Confidence_1.000_Length_112_transcripts_v2_72921|---NA---\r", "\r\n", "gctagtttcaggtgtgcattcattgaataaatgtatttgtatttagtacgagtgtataataaagcagtaaatacaaatacatttattcaatgaatgcacacctgaaactagc\r", "\r\n", ">Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922|---NA---\r", "\r\n", "tgcgtagctcggtggatgtatagagaatgggaattcagtttcagattaggtatgagaccatggatatttgtagnnnnnnnnnnncagcactctcagcacctgttgtagcag\r", "\r\n", ">transcripts_v2_72923|---NA---\r", "\r\n", 
"ggacgatgaggannnnnnnnnnnnnnctgatgacagtaacgatgatgatcttgatgatgatagcgttgacgagaacgacgaggatgaagactatgaagtga\r", "\r\n", ">Locus_9072_Transcript_1/1_Confidence_0.667_Length_101_transcripts_v2_72924|---NA---\r", "\r\n", "ttcttgaagatttttttaagacaatcgtgttcagttgtaataatttttacataagtaatctaaatattattttttnnnnnnnnnnnnnnnnnagtcaaggg\r", "\r\n" ] } ], "source": [ "!tail blast2go_fasta_Pdamv2.fasta\n", "#FASTA appears to be sorted by contig length" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "72890\r\n" ] } ], "source": [ "!fgrep -c \">\" blast2go_fasta_Pdamv2.fasta" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#blastx of the transcriptome against Uniprot/Swissprot. Options used:\n", "# -query            FASTA file\n", "# -db               use your blastx database address\n", "# -max_target_seqs  maximum number of target sequences = 1\n", "# -max_hsps         maximum number of high-scoring pairs = 1\n", "# -outfmt 6         output format = tabular\n", "# -evalue           E-value = 10^-5\n", "# -num_threads      number of threads = 8\n", "# -out              direct output to analyses directory\n", "# 2>                direct standard error output to its own file\n", "#Comments are kept up here because text after a trailing backslash breaks the line continuation.\n", "#Output paths are relative to data/Pdam, the working directory at this point.\n", "!blastx \\\n", "-query blast2go_fasta_Pdamv2.fasta \\\n", "-db ~blast/db/uniprot_sprot \\\n", "-max_target_seqs 1 \\\n", "-max_hsps 1 \\\n", "-outfmt 6 \\\n", "-evalue 1E-05 \\\n", "-num_threads 8 \\\n", "-out ../../analyses/Pdam/Pdam_blastx_uniprot.tab \\\n", "2> ../../analyses/Pdam/Pdam_blastx_uniprot.error" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "cd ../../analyses/Pdam" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1|spectrin\tsp|P13395|SPTCA_DROME\t56.44\t2328\t1003\t7\t7455\t490\t91\t2413\t0.0\t 2593\r\n", "Locus_1685_Transcript_2/2_Confidence_1.000_Length_7457_transcripts_v2_2|spectrin\tsp|P02549|SPTA1_HUMAN\t43.93\t2340\t1279\t11\t7455\t484\t97\t2419\t0.0\t 1867\r\n", 
"Locus_177_Transcript_12/12_Confidence_0.500_Length_6585_transcripts_v2_3|vitellogenin\tsp|Q91062|VIT_ICHUN\t22.09\t860\t593\t22\t3371\t5818\t97\t923\t2e-39\t 166\r\n", "Locus_1199_Transcript_3/4_Confidence_0.750_Length_5569_transcripts_v2_11|serine\tsp|Q627N3|GLC7B_CAEBR\t88.62\t334\t21\t2\t2704\t3699\t2\t320\t0.0\t 609\r\n", "Locus_177_Transcript_9/12_Confidence_0.435_Length_5444_transcripts_v2_12|vitellogenin\tsp|Q91062|VIT_ICHUN\t22.09\t860\t593\t22\t3189\t742\t97\t923\t9e-40\t 166\r\n", "transcripts_v2_15|pyruvate\tsp|Q59754|PPDK_RHIME\t62.84\t557\t207\t0\t1988\t3658\t17\t573\t0.0\t 729\r\n", "transcripts_v2_16|pyruvate\tsp|P22983|PPDK_CLOSY\t55.17\t580\t237\t4\t1988\t3718\t13\t572\t0.0\t 651\r\n", "transcripts_v2_17|pyruvate\tsp|Q59754|PPDK_RHIME\t62.84\t557\t207\t0\t1988\t3658\t17\t573\t0.0\t 729\r\n", "Locus_1762_Transcript_7/14_Confidence_0.340_Length_4399_transcripts_v2_18|dna-binding\tsp|P81270|ERG_MOUSE\t47.61\t355\t152\t8\t2562\t3611\t121\t446\t2e-86\t 296\r\n", "Locus_1762_Transcript_10/14_Confidence_0.340_Length_4397_transcripts_v2_19|dna-binding\tsp|Q8R4Z4|ETV3_MOUSE\t50.00\t140\t62\t2\t3159\t3575\t28\t160\t9e-29\t 126\r\n" ] } ], "source": [ "!head -10 Pdam_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_9332_Transcript_1/1_Confidence_1.000_Length_172_transcripts_v2_72846|---NA---\tsp|O00160|MYO1F_HUMAN\t68.75\t32\t10\t0\t170\t75\t1065\t1096\t1e-10\t60.1\r\n", "Locus_935_Transcript_2/3_Confidence_0.667_Length_170_transcripts_v2_72855|elongation\tsp|P26642|EF1GA_XENLA\t56.41\t39\t17\t0\t3\t119\t365\t403\t2e-07\t50.1\r\n", "Locus_9009_Transcript_1/1_Confidence_1.000_Length_169_transcripts_v2_72864|---NA---\tsp|Q8N2G6|ZCH24_HUMAN\t62.50\t32\t12\t0\t72\t167\t149\t180\t6e-08\t50.4\r\n", 
"Locus_9512_Transcript_1/1_Confidence_1.000_Length_165_transcripts_v2_72886|nadh\tsp|P05510|NU5M_NEUCR\t100.00\t53\t0\t0\t2\t160\t583\t635\t3e-27\t 107\r\n", "transcripts_v2_72898|elongation\tsp|P54412|EF1G_CAEEL\t53.85\t52\t24\t0\t3\t158\t329\t380\t2e-10\t58.5\r\n", "transcripts_v2_72908|---NA---\tsp|Q9TM69|PSBA_ALETA\t100.00\t18\t0\t0\t5\t58\t128\t145\t6e-06\t45.4\r\n", "transcripts_v2_72909|hypothetical\tsp|O97341|CALM_SUBDO\t84.85\t33\t5\t0\t1\t99\t105\t137\t2e-11\t58.9\r\n", "Locus_9529_Transcript_1/1_Confidence_1.000_Length_136_transcripts_v2_72910|p700\tsp|Q9XQV2|PSAB_HETTR\t78.05\t41\t9\t0\t14\t136\t546\t586\t1e-13\t68.2\r\n", "Locus_9530_Transcript_1/1_Confidence_1.000_Length_118_transcripts_v2_72918|---NA---\tsp|P43974|Y258_HAEIN\t72.97\t37\t10\t0\t3\t113\t2\t38\t5e-09\t53.5\r\n", "Locus_9787_Transcript_1/1_Confidence_0.714_Length_111_transcripts_v2_72922|---NA---\tsp|Q9BBC1|PSBA_AMPCA\t82.61\t23\t4\t0\t4\t72\t119\t141\t4e-06\t45.1\r\n" ] } ], "source": [ "!tail -10 Pdam_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 20605 247260 2784390 /Users/jay/Desktop/blast_jobs/Pdam_blastx_uniprot.tab\r\n" ] } ], "source": [ "!wc Pdam_blastx_uniprot.tab" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1|spectrin\tsp|P13395|SPTCA_DROME\t56.44\t2328\t1003\t7\t7455\t490\t91\t2413\t0.0\t 2593\n", "SQLShare ready version has Pipes converted to Tabs ....\n", "Locus_1685_Transcript_1/2_Confidence_1.000_Length_7457_transcripts_v2_1\tspectrin\tsp\tP13395\tSPTCA_DROME\t56.44\t2328\t1003\t7\t7455\t490\t91\t2413\t0.0\t 2593\n" ] } ], "source": [ "#Replacing pipes with tabs to produce a fully tab-delimited file\n", "#(tr reads standard input and writes standard output, so both files are given as redirects)\n", "!tr '|' \"\\t\" < Pdam_blastx_uniprot.tab > Pdam_blastx_uniprot_sql.tab\n", "!head -1 
Pdam_blastx_uniprot.tab\n", "!echo SQLShare ready version has Pipes converted to Tabs ....\n", "!head -1 Pdam_blastx_uniprot_sql.tab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Manually uploading Pdam_blastx_uniprot_sql.tab to SQLShare and joining with GOSlim:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### First upload dataset\n", "![screen shot1](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.01.38%20PM.png?raw=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Then find the dataset, execute query, and download the new dataset\n", "![screen shot](https://github.com/jldimond/Coral-CpG-ratio-MS/blob/master/images/Screen%20Shot%202015-09-25%20at%2012.29.18%20PM.png?raw=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Query (note: insert your SQLShare account instead of jldimond@washington.edu)\n", "`SELECT Distinct Column1 as ContigID, GOSlim_bin\n", " FROM [jldimond@washington.edu].[Pdam_blastx_uniprot_sql_1.tab]anno\n", " left join [sr320@washington.edu].[SPID and GO Numbers]go\n", " on anno.Column3=go.SPID or anno.Column4=go.SPID\n", " left join [sr320@washington.edu].[GO_to_GOslim]slim\n", " on go.GOID=slim.GO_id where aspect like 'P'`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Output file downloaded to ./analyses/Pdam/Pdam_GOSlim.csv" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID,GOSlim_bin\r", "\r\n", "Locus_1_Transcript_1/1_Confidence_1.000_Length_225_transcripts_v2_8356,protein metabolism\r", "\r\n", "Locus_1_Transcript_1/1_Confidence_1.000_Length_225_transcripts_v2_8356,other biological processes\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389,developmental processes\r", "\r\n", 
"Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389,other biological processes\r", "\r\n", "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976,transport\r", "\r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529,signal transduction\r", "\r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420,protein metabolism\r", "\r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337,other metabolic processes\r", "\r\n", "Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512,other metabolic processes\r", "\r\n" ] } ], "source": [ "!head -10 Pdam_GOSlim.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#Replacing commas with tabs: read the downloaded CSV, write the tab-delimited copy\n", "#(tr reads standard input and writes standard output, so both files are given as redirects)\n", "!tr ',' \"\\t\" < Pdam_GOSlim.csv > Pdam_GOSlim.tab" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ContigID\tGOSlim_bin\r", "\r\n", "Locus_1_Transcript_1/1_Confidence_1.000_Length_225_transcripts_v2_8356\tprotein metabolism\r", "\r\n", "Locus_1_Transcript_1/1_Confidence_1.000_Length_225_transcripts_v2_8356\tother biological processes\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tdevelopmental processes\r", "\r\n", "Locus_1000_Transcript_1/1_Confidence_1.000_Length_292_transcripts_v2_6389\tother biological processes\r", "\r\n", "Locus_10004_Transcript_1/1_Confidence_1.000_Length_174_transcripts_v2_10976\ttransport\r", "\r\n", "Locus_10024_Transcript_1/1_Confidence_1.000_Length_411_transcripts_v2_4529\tsignal transduction\r", "\r\n", "Locus_1003_Transcript_1/1_Confidence_1.000_Length_421_transcripts_v2_4420\tprotein metabolism\r", "\r\n", "Locus_10037_Transcript_1/1_Confidence_1.000_Length_428_transcripts_v2_4337\tother metabolic processes\r", "\r\n", 
"Locus_10043_Transcript_1/1_Confidence_1.000_Length_339_transcripts_v2_5512\tother metabolic processes\r", "\r\n" ] } ], "source": [ "!head -10 Pdam_GOSlim.tab" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }