# QC pipeline for simulated data

/opt/gaoyan/script/realdata

## classification
sh classification.sh SAMPLE_NAME reads.fa
returns the species number

## concoct
sh concoct.sh SAMPLE_NAME contigs.fa PEreads1 PEreads2 species_number
generates concoct_SAMPLE_NAME/contigs/megahit_c10K.fa, concoct_SAMPLE_NAME/output/clustering_gt1000.csv
output directory: concoct_SAMPLE_NAME

## reads mapping contigs
sh reads_mapping.sh SAMPLE_NAME contigs_cutting.fa PEreads1 PEreads2
generates reads_mapping_contigs_SAMPLE_NAME/out.F
output directory: reads_mapping_contigs_SAMPLE_NAME

## find target_cluster
sh target_find.sh contigs_cutting.fa marker.fa clustering_gt1000.csv SAMPLE_NAME
generates marker_blast_SAMPLE_NAME/target
output directory: marker_blast_SAMPLE_NAME

## analysis clusters
sh blast_megahit.sh contigs_cutting.fa blast_result_file clustering_gt1000.csv out.F DIR_NAME
generates DIR_NAME/contig_list, DIR_NAME/contig_species, DIR_NAME/cluster_s*, DIR_NAME/count, DIR_NAME/count_read
output directory: DIR_NAME

## evaluation
sh target_evaluation.sh clusterN(target cluster) contig_species cluster_sN(target cluster) out.F DIR_NAME
generates DIR_NAME/result
output directory: DIR_NAME

## dataloss
sh dataloss.sh contigs_list clustering_gt1000.csv out.F DIR_NAME
generates DIR_NAME/result_dataloss
output directory: DIR_NAME

## extract target reads
sh findclusterreads.sh cluster_sN(target cluster) out.F SAMPLE_NAME
generates target_reads_SAMPLE_NAME/clusterreads
output directory: target_reads_SAMPLE_NAME

python extract.py
generates target_reads.fa

## coverage
python coverage_percent.py
returns the percent of positions with positive coverage

** scripts can be found in /home/xw/QC

** tools used
Parallel_Meta http://computationalbioenergy.org/parallel-meta.html
MEGAHIT https://github.com/voutcn/megahit
CONCOCT https://github.com/BinPro/CONCOCT
BLAST https://blast.ncbi.nlm.nih.gov/Blast.cgi