In light of my recent post using a brute-force string searching algorithm, I decided to post an implementation of the Knuth-Morris-Pratt algorithm in Perl. This implementation is essentially straight from the pseudo-code found on the wiki.
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
my $string = "ABC ABCDAB ABCDABCDABDE";
my $pattern = "ABCDABD";
my $table_array = build_table($pattern);
print "found pattern at index: "
. search_string( $string, $pattern, $table_array ),"\n";
sub search_string {
my $string = shift;
my $pattern = shift;
my $table_array = shift;
my $m = 0; # beginning of current match
my $i = 0; # the position of the current character in pattern sought
my @split_string = split(//, $string);
my @split_pattern = split(//, $pattern);
while ( $m + $i < scalar(@split_string) ) {
if ( $split_pattern[$i] eq $split_string[ $m + $i] ) {
if ( $i == scalar(@split_pattern) - 1 ) {
return $m;
}
$i++;
}
else {
if ( @{$table_array}[$i] > -1 ) {
$m = $m + $i - @{$table_array}[$i];
$i = @{$table_array}[$i];
}
else {
$i = 0;
$m++;
}
}
}
return length($string);
}
sub build_table {
my $string = shift;
my @split_string = split (//, $string);
my $pos = 2;
my $cnd = 0;
@{$table_array}[0] = -1;
@{$table_array}[1] = 0;
while ( $pos < scalar(@split_string) ) {
if ( $split_string[$pos-1] eq $split_string[$cnd] ) {
$cnd++;
@{$table_array}[$pos] = $cnd;
$pos++;
}
elsif ( $cnd > 0 ) {
$cnd = @{$table_array}[$cnd];
}
else {
@{$table_array}[$pos] = 0;
$pos++;
}
}
return $table_array;
}
It should be noted, however, that the index() function in Perl uses the Boyer-Moore algorithm under the hood- so implementing a string searching function like the following may be an easier, and faster solution. It takes a pattern and string (to search within) as arguments.
use strict;
use warnings;
use Data::Dumper;
my $matches = occurrences('CGATGGTCG',
'TCGATGGTAAATACTGTGCGATGGTCGATGGTTCGATGGTCGATGGTCGGGACGATGGTGGGCGATGGTGCGATGGTTCGATGGTACGATGGTCGATGGTACGATGGTCAGGGCGATGGTTAACGCGATGGTGGCAGTCGATGGTTGCGATGGTTCGATGGTCCCGATGGTGCGACGATGGTATTCCGATGGTTCGATGGTCGATGGTACTGCGATGGTCGATGGTACATCGATGGTATCCGATGGTCGATGGTGGCGATGGTCGATGGTCGATGGTCGATGGTGTTATCGATGGTCCGATGGTCGATGGTTAGCGATGGTTATAGGTATCCCGATGGTCGATGGTCGATGGTTACGATGGTCCGATGGTCGATGGTCTTTGTCGATGGTTCGATGGTCGATGGTAACGATGGTCGATGGTTTGTCGATGGTCGCGATGGTCGCCGATGGTGCCGATGGTGGGTCGATGGTGCTCGATGGTCGATGGTCCGCGATGGTTGCGTCGATGGTCGATGGTCGATGGTGGACTCGATGGTCACGATGGTTTCTCGATGGTGGTTCCGATGGTCGATGGTGTCGATGGTACGCAAGTACAGATAGTGCGATGGTGAGGATAGTGCGATGGTAGCGATGGTCGCGATGGTCGATGGTTACTTGCCTGCGATGGTGTGTACGATGGTCGGAACGCCCGATGGTGACGATGGTCATGCGATGGTATTCAATTCGATGGTCTCCGGCCGAAGAAAGCGATGGTCCCAAGATGATCGATGGTCGATGGTCGATGGTGTCGATGGTCCGATGGTCCGTTTCGATGGTACTTCGATGGTTTGCGATGGTATATGTCGATGGTCCAACGATGGTGGTGCGATGGTCTGCGATGGTA');
print join(' ', @{$matches});
sub occurrences {
my( $x, $y ) = @_;
my $pos = 0;
my $matches = 0;
my @locations;
while (1) {
$pos = index($y, $x, $pos);
last if($pos < 0);
$matches++;
$pos++;
push @locations, $pos;
}
return \@locations;
}