#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#

# Preflight stage declaration; the implementation lives in
# stages/preflight/denovo. It declares inputs only and no outputs.
stage ASSEMBLER_PREFLIGHT(
    in  string input_mode,
    in  map[]  sample_def,
    in  string barcode_whitelist,
    in  map    downsample,
    in  float  loading_mass,
    in  int    genome_size,
    in  bool   check_executables,
    src py     "stages/preflight/denovo",
)

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#
filetype fastb;
filetype qualp;
filetype bci;
filetype fastq.gz;
filetype json;
filetype script;
filetype txt;
filetype csv;
filetype bv;

# Takes a list of fastq.gz files and emits a fastb, a qualp and a bci file.
# Chunks are parameterised by a single fastq.gz file (in_reads_file).
stage FASTQ_TO_FASTBQUALP(
    in  fastq.gz[] in_reads,
    out fastb      out_reads,
    out qualp      out_quals,
    out bci        out_bci,
    src py         "stages/denovo/assembly_prep",
) split using (
    in  fastq.gz   in_reads_file,
)

# Produces a single `sequencers` string from the sample definition and the
# fastq mode.
stage DETECT_PLATFORMS(
    in  map[]  sample_def,
    in  string fastq_mode,
    out string sequencers,
    src py     "stages/denovo/detect_platforms",
)

# First of the ASSEMBLER_* stages. Like every stage in this family it exposes
# one unnamed (default) `path` output and declares an empty split block.
stage ASSEMBLER_DF(
    in  string pipeline_id,
    in  string sample_id,
    in  string sample_desc,
    in  string sequencers,
    in  fastb  reads,
    in  qualp  quals,
    in  bci    bci,
    in  map    downsample,
    in  map    addin,
    in  int    maxcores,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  bv     mspedges,
    in  float  loading_mass,
    in  int    genome_size,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/df",
) split using (
)

# The stages below (TR, MC, CP, ML, CL, DM, ACP, MP, M2, PR) share the same
# input set: sample identifiers, a `parent_dir` path, and the run options
# (addin, maxcores, nodebugmem, quit_stage, largemem_gb). The order of the
# three middle options differs for CP and PR; it is kept exactly as declared.
stage ASSEMBLER_TR(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/tr",
) split using (
)

stage ASSEMBLER_MC(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/mc",
) split using (
)

stage ASSEMBLER_CP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  map    addin,
    in  int    maxcores,
    in  bool   nodebugmem,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/cp",
) split using (
)

stage ASSEMBLER_ML(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/ml",
) split using (
)

stage ASSEMBLER_CL(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/cl",
) split using (
)

stage ASSEMBLER_DM(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/dm",
) split using (
)

stage ASSEMBLER_ACP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/acp",
) split using (
)

stage ASSEMBLER_MP(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/mp",
) split using (
)

stage ASSEMBLER_M2(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    src py     "stages/denovo/m2",
) split using (
)

# Last ASSEMBLER_* stage. In addition to the default path output it emits a
# csv (`summary_cs`) and a txt (`report`).
stage ASSEMBLER_PR(
    in  string sample_id,
    in  string sample_desc,
    in  path   parent_dir,
    in  string known_sample_id,
    in  map    addin,
    in  int    maxcores,
    in  bool   nodebugmem,
    in  string quit_stage,
    in  int    largemem_gb,
    out path,
    out csv    summary_cs,
    out txt    report,
    src py     "stages/denovo/pr",
) split using (
)

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
filetype txt;
filetype fastq;
filetype fastq.gz;
filetype rsd;

# Runs the external command "tada martian bucket-bcs". Chunks are keyed by the
# integer `which`.
stage BUCKET_FASTQS(
    in  txt   barcode_whitelist_path,
    in  int   trim_length,
    in  map[] chunks,
    in  float max_expected_barcode_errors,
    in  float bc_confidence_threshold,
    in  int   requested_read_pairs,
    out int   total_reads,
    out float final_subsample_rate,
    out rsd[] no_bc_read_buckets,
    out rsd[] read_buckets,
    out rsd   bc_counts,
    src exec  "tada martian bucket-bcs",
) split using (
    in  int   which,
)

# Runs the external command "tada martian sort-bcs" over the rsd buckets and
# emits fastq.gz files. Chunks carry `chunk_id` and `total_chunks`.
stage SORT_FASTQS(
    in  txt        barcode_whitelist_path,
    in  rsd[]      read_buckets,
    in  float      subsample_rate,
    out fastq.gz[] reads,
    src exec       "tada martian sort-bcs",
) split using (
    in  int        chunk_id,
    in  int        total_chunks,
)

#
# Copyright (c) 2015 10X Genomics, Inc. All rights reserved.
#
filetype fastq;
filetype bam;
filetype bam.bai;
filetype bed;
filetype json;
filetype fastq.gz;
filetype txt;

# Builds the per-chunk read descriptions consumed by BUCKET_FASTQS, along with
# the read groups, downsampling info, whitelist path and requested read pairs.
stage SETUP_CHUNKS(
    in  string   sample_id,
    in  map[]    sample_def   "list of dictionary specifying input data",
    in  string   input_mode   "configuration of the input fastqs",
    in  string   barcode_whitelist,
    in  map      downsample,
    out map[]    chunks       "map has barcode, barcode_reverse_complement, sample_index, read1, read2, gem_group, and read group fields",
    out string[] read_groups  "list of strings representing read groups",
    out json     downsample_info,
    out txt      barcode_whitelist_path,
    out int      requested_read_pairs,
    src py       "stages/reads/setup_chunks",
)

# FASTQ preparation sub-pipeline: SETUP_CHUNKS -> BUCKET_FASTQS -> SORT_FASTQS.
# The barcode error and confidence thresholds are fixed here (1.0 and 0.975).
# `lot_info` is always returned as null (see the FIXME in the return block).
pipeline _FASTQ_PREP_NEW(
    in  string     fastq_mode  "configuration of the input fastqs",
    in  map[]      sample_def,
    in  int        trim_length,
    in  string     sample_id,
    in  map        downsample,
    in  string     barcode_whitelist,
    out fastq.gz[] reads,
    out string[]   read_groups,
    out json       lot_info,
    out json       downsample_info,
    out txt        barcode_whitelist_path,
)
{
    call local volatile SETUP_CHUNKS(
        sample_id         = self.sample_id,
        downsample        = self.downsample,
        input_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        barcode_whitelist = self.barcode_whitelist,
    )

    call volatile BUCKET_FASTQS(
        trim_length                 = self.trim_length,
        chunks                      = SETUP_CHUNKS.chunks,
        barcode_whitelist_path      = SETUP_CHUNKS.barcode_whitelist_path,
        max_expected_barcode_errors = 1.0,
        bc_confidence_threshold     = 0.975,
        requested_read_pairs        = SETUP_CHUNKS.requested_read_pairs,
    )

    call volatile SORT_FASTQS(
        barcode_whitelist_path = SETUP_CHUNKS.barcode_whitelist_path,
        read_buckets           = BUCKET_FASTQS.read_buckets,
        subsample_rate         = BUCKET_FASTQS.final_subsample_rate,
    )

    return (
        reads                  = SORT_FASTQS.reads,
        read_groups            = SETUP_CHUNKS.read_groups,
        lot_info               = null, #FIXME
        downsample_info        = SETUP_CHUNKS.downsample_info,
        barcode_whitelist_path = SETUP_CHUNKS.barcode_whitelist_path,
    )
}

filetype fastb; # binary sequence file
filetype qualp; # packed quality scores
filetype bci;   # barcode index

# Assembler input preparation: runs _FASTQ_PREP_NEW, then converts its sorted
# fastq.gz reads with FASTQ_TO_FASTBQUALP. DETECT_PLATFORMS is called on the
# raw sample definition and does not consume any other call's output.
# The sorted fastq.gz files are also passed through as `fqreads`.
pipeline _ASSEMBLER_PREP(
    in  string     sample_id,
    in  string     fastq_mode         "configuration of the input fastqs",
    in  map[]      sample_def,
    in  string     barcode_whitelist  "name of barcode whitelist file",
    in  int        trim_length,
    in  map        downsample,
    out fastb      reads,
    out qualp      quals,
    out bci        bci,
    out fastq.gz[] fqreads,
    out txt        barcode_whitelist_path,
    out string     sequencers,
)
{
    call _FASTQ_PREP_NEW(
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        trim_length       = self.trim_length,
        sample_id         = self.sample_id,
        downsample        = self.downsample,
        barcode_whitelist = self.barcode_whitelist,
    )

    call volatile DETECT_PLATFORMS(
        fastq_mode = self.fastq_mode,
        sample_def = self.sample_def,
    )

    call volatile FASTQ_TO_FASTBQUALP(
        in_reads = _FASTQ_PREP_NEW.reads,
    )

    return (
        reads                  = FASTQ_TO_FASTBQUALP.out_reads,
        quals                  = FASTQ_TO_FASTBQUALP.out_quals,
        bci                    = FASTQ_TO_FASTBQUALP.out_bci,
        fqreads                = _FASTQ_PREP_NEW.reads,
        barcode_whitelist_path = _FASTQ_PREP_NEW.barcode_whitelist_path,
        sequencers             = DETECT_PLATFORMS.sequencers,
    )
}

#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
#
# Copyright (c) 2017 10X Genomics, Inc. All rights reserved.
#
filetype fofn;
filetype txt;
filetype bv;
filetype fastq;
filetype fastq.gz;
filetype perm;
filetype msp;
filetype sedge_asm;
filetype sedge_bcs;
filetype graph;
filetype node_bcs;

# Runs the external command "tada martian msp" over the fastq.gz inputs and
# emits msp chunk files. Each chunk receives a `perm` file and one fastq.gz.
stage MSP(
    in  int        trim_min_qual,
    in  fastq.gz[] fastqs,
    in  txt        barcode_whitelist,
    out msp[]      chunks,
    src exec       "tada martian msp",
) split using (
    in  perm       permutation,
    in  fastq.gz   chunk,
)

# Runs the external command "tada martian shard-asm" over the msp chunks and
# emits sedge_asm and sedge_bcs files. Chunks carry `chunk_id` and
# `total_chunks`.
stage SHARD_ASM(
    in  int         min_kmer_obs,
    in  msp[]       chunks,
    out sedge_asm[] sedge_asm,
    out sedge_bcs[] sedge_bcs,
    src exec        "tada martian shard-asm",
) split using (
    in  int         chunk_id,
    in  int         total_chunks,
)

# Runs the external command "tada martian main-asm-sn" and emits a single bv
# file (`asm_graph`). The split block is declared but empty.
stage MAIN_ASM_SN(
    in  sedge_asm[] sedge_asm,
    in  sedge_bcs[] sedge_bcs,
    out bv          asm_graph,
    src exec        "tada martian main-asm-sn",
) split using (
)

# Sub-pipeline MSP -> SHARD_ASM -> MAIN_ASM_SN. `trim_min_qual` is fixed to 7
# here; `min_kmer_obs` is supplied by the caller.
pipeline _ASM_SN(
    in  int        min_kmer_obs,
    in  fastq.gz[] fastqs,
    in  txt        barcode_whitelist,
    out bv         asm_graph,
)
{
    call volatile MSP(
        trim_min_qual     = 7,
        fastqs            = self.fastqs,
        barcode_whitelist = self.barcode_whitelist,
    )

    call volatile SHARD_ASM(
        min_kmer_obs = self.min_kmer_obs,
        chunks       = MSP.chunks,
    )

    call volatile MAIN_ASM_SN(
        sedge_asm = SHARD_ASM.sedge_asm,
        sedge_bcs = SHARD_ASM.sedge_bcs,
    )

    return (
        asm_graph = MAIN_ASM_SN.asm_graph,
    )
}

# Full assembler pipeline.
#
# Order of work as wired below:
#   1. ASSEMBLER_PREFLIGHT (preflight call, check_executables = true).
#   2. _ASSEMBLER_PREP produces reads/quals/bci, the sorted fastq.gz files,
#      the whitelist path and the sequencers string.
#   3. _ASM_SN (min_kmer_obs = 3) builds `asm_graph` from the sorted fastqs.
#   4. ASSEMBLER_DF consumes the prep outputs plus `asm_graph` (as mspedges).
#   5. TR -> MC -> CP -> ML -> CL -> DM -> ACP -> MP -> M2 -> PR, where each
#      stage's `parent_dir` is bound to the previous stage's default output.
#
# ASSEMBLER_PR is the only stage in the chain called without `volatile`; its
# default path, summary_cs and report become the pipeline outputs.
pipeline _ASSEMBLER(
    in  string pipeline_id,
    in  string sample_id,
    in  string fastq_mode         "configuration of the input fastqs",
    in  string sample_desc,
    in  map[]  sample_def,
    in  string barcode_whitelist  "name of barcode whitelist file",
    in  int    trim_length,
    in  string known_sample_id,
    in  map    downsample,
    in  bool   nodebugmem,
    in  map    addin,
    in  int    maxcores,
    in  float  loading_mass,
    in  int    genome_size,
    in  string quit_stage,
    in  int    largemem_gb,
    out path   assembly           "Raw assembly files",
    out csv    summary            "Run summary",
    out txt    report             "Run report",
)
{
    call preflight ASSEMBLER_PREFLIGHT(
        input_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        barcode_whitelist = self.barcode_whitelist,
        downsample        = self.downsample,
        loading_mass      = self.loading_mass,
        genome_size       = self.genome_size,
        check_executables = true,
    )

    call _ASSEMBLER_PREP(
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_def        = self.sample_def,
        barcode_whitelist = self.barcode_whitelist,
        trim_length       = self.trim_length,
        downsample        = self.downsample,
    )

    call _ASM_SN(
        min_kmer_obs      = 3,
        fastqs            = _ASSEMBLER_PREP.fqreads,
        barcode_whitelist = _ASSEMBLER_PREP.barcode_whitelist_path,
    )

    call volatile ASSEMBLER_DF(
        pipeline_id     = self.pipeline_id,
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        sequencers      = _ASSEMBLER_PREP.sequencers,
        reads           = _ASSEMBLER_PREP.reads,
        quals           = _ASSEMBLER_PREP.quals,
        bci             = _ASSEMBLER_PREP.bci,
        downsample      = self.downsample,
        nodebugmem      = self.nodebugmem,
        loading_mass    = self.loading_mass,
        genome_size     = self.genome_size,
        mspedges        = _ASM_SN.asm_graph,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_TR(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_DF,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_MC(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_TR,
        known_sample_id = self.known_sample_id,
        nodebugmem      = self.nodebugmem,
        addin           = self.addin,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_CP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_MC,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_ML(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_CP,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_CL(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_ML,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_DM(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_CL,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_ACP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_DM,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_MP(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_ACP,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call volatile ASSEMBLER_M2(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_MP,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    call ASSEMBLER_PR(
        sample_id       = self.sample_id,
        sample_desc     = self.sample_desc,
        parent_dir      = ASSEMBLER_M2,
        known_sample_id = self.known_sample_id,
        addin           = self.addin,
        nodebugmem      = self.nodebugmem,
        maxcores        = self.maxcores,
        quit_stage      = self.quit_stage,
        largemem_gb     = self.largemem_gb,
    )

    return (
        assembly = ASSEMBLER_PR,
        summary  = ASSEMBLER_PR.summary_cs,
        report   = ASSEMBLER_PR.report,
    )
}

# Top-level wrapper around _ASSEMBLER.
#
# Values fixed by this wrapper rather than exposed to the caller:
#   barcode_whitelist = "4M-with-alts-february-2016"
#   trim_length       = 7
#   maxcores          = 64
#   pipeline_id, addin, known_sample_id, quit_stage, largemem_gb = null
pipeline ASSEMBLER_CS(
    in  string sample_id,
    in  string fastq_mode  "configuration of the input fastqs",
    in  map[]  sample_def,
    in  string sample_desc,
    in  map    downsample,
    in  bool   nodebugmem,
    in  float  loading_mass,
    in  int    genome_size,
    out csv    summary     "Run summary",
    out txt    report      "Run report",
    out path   assembly    "Raw assembly files",
)
{
    call _ASSEMBLER(
        pipeline_id       = null,
        sample_id         = self.sample_id,
        fastq_mode        = self.fastq_mode,
        sample_desc       = self.sample_desc,
        sample_def        = self.sample_def,
        loading_mass      = self.loading_mass,
        genome_size       = self.genome_size,
        barcode_whitelist = "4M-with-alts-february-2016",
        trim_length       = 7,
        downsample        = self.downsample,
        addin             = null,
        known_sample_id   = null,
        nodebugmem        = self.nodebugmem,
        maxcores          = 64,
        quit_stage        = null,
        largemem_gb       = null,
    )

    return (
        assembly = _ASSEMBLER.assembly,
        summary  = _ASSEMBLER.summary,
        report   = _ASSEMBLER.report,
    )
}

# Invocation for sample "Geoduck": one sample_def entry reading from
# /gscratch/scrubbed/sr320/Chrom, with a downsample target of 1200000000
# reads. loading_mass and genome_size are passed as null.
call ASSEMBLER_CS(
    fastq_mode   = "ILMN_BCL2FASTQ",
    sample_id    = "Geoduck",
    sample_def   = [
        {
            "gem_group": null,
            "lanes": null,
            "read_path": "/gscratch/scrubbed/sr320/Chrom",
            "sample_indices": [
                "any"
            ],
            "sample_names": [
                "Geoduck-1"
            ],
            "library": null,
            "bc_in_read": 1,
            "bc_length": 16
        }
    ],
    sample_desc  = "",
    downsample   = {
        "target_reads": 1200000000
    },
    loading_mass = null,
    genome_size  = null,
    nodebugmem   = false,
)