#!/usr/bin/perl -w #-------------------------------------------------------------# # kmer_counting_in_FASTQ.pl - By Jacob Shreve # # Bioinformatics Core, Purdue University # # v1.0.0 # #-------------------------------------------------------------# # The purpose of this script is to read-in a FASTQ file line- # by-line and record every kmer found as defined by the user. # All kmers are recorded and quantified, with several metrics # reported after the FASTQ has been fully transversed. # Usage: perl kmer_counting_in_FASTQ.pl / # sample_file.FASTQ # kmer # Example: perl kmer_counting_in_FASTQ.pl 01928_1.fastq 7 #---- STEP1: build a hash with all kmers from the reads ----# $position = 0; $reads = 0; open(FASTQ,$ARGV[0]) || die("Can't open the FASTQ file."); while() { $position++; if($position == 2) { $reads++; $_ =~ s/\n//; for($i=0; $i<=(length($_)-$ARGV[1]); $i++) { $kmer = substr($_, $i, $ARGV[1]); if(not defined $HASH{$kmer}) { $HASH{$kmer} = 1; } else { $HASH{$kmer} += 1; } } } if($position == 4) { $position = 0; } } close(FASTQ); #---- STEP2: scan through the hash to calculate metrics ----# $count = 0; $running_total = 0; $top_5 = ""; $bottom_5 = ""; foreach (sort { $HASH{$b} <=> $HASH{$a} } keys(%HASH) ) { $count++; $running_total += $HASH{$_}; if($count <= 5) { $top_5 .= "$HASH{$_}\t$_\n"; } if($count == 10) { $at_10 = $running_total; } if($count == 100) { $at_100 = $running_total; } if($count == 1000) { $at_1000 = $running_total; } if($count == 10000) { $at_10000 = $running_total; } } for($q=-1; $q>=-5; $q--) { $bottom_key = (sort { $HASH{$b} <=> $HASH{$a} } keys(%HASH))[$q]; $bottom_5 .= "$HASH{$bottom_key}\t$bottom_key\n"; } #---- STEP3: report the metrics ----# print "Used $0 to find kmer size $ARGV[1] metrics\n"; print "Reads file: $ARGV[0]\n\n"; print "Total number of reads: $reads\n"; print "Total number of kmers: ".(keys %HASH)."\n\n"; print "Top 10 kmers combined: $at_10/$running_total: ".sprintf("%.2f", (($at_10/$running_total)*100))."% of all kmers\n"; print "Top 100 kmers combined: $at_100/$running_total: ".sprintf("%.2f", (($at_100/$running_total)*100))."% of all kmers\n"; print "Top 1000 kmers combined: $at_1000/$running_total: ".sprintf("%.2f", (($at_1000/$running_total)*100))."% of all kmers\n"; print "Top 10000 kmers combined: $at_10000/$running_total: ".sprintf("%.2f", (($at_10000/$running_total)*100))."% of all kmers\n"; print "\nMost common 5 kmers:\n$top_5\nLeast common 5 kmers:\n$bottom_5\n";