function [result, K] = p_spectrum_fast(s,t,p)
%P_SPECTRUM_FAST
% -Finds the contiguous subsequence match count between strings s and t
%  by using a dynamic programming implementation,
%  where the length of the subsequence is p.
%  *(There is also a brute force implementation of this algorithm.
%    Type help p_spectrum_fast_bf for info.)
% -Only the last index of the matrix K is filled in, rather than the
%  whole structure; every other entry of K is left at -1.
%
% -Simply prompting the function will return the value K(s,t), however
%  using the function as [result,K] = p_spectrum_fast(s,t,p) will also
%  return the matrix K.
%
% -The following recurrences are used:
%    K[p](sa,t)  = K[p](s,t) + [Summation of i from 1 to |t|] G[p-1](s,t(1:i-1)) [t(i) == a]
%    K[p](s,t)   = 0   if |s| < p or |t| < p
%    G[p](sa,tb) = G[p-1](s,t)[a==b]
%    G[0](s,t)   = 1   for all s,t
%    G[p](s,t)   = 0   if |s| == 0 or |t| == 0   (p > 0)
%  G[q] is built one layer at a time for q = 1..p over all prefix pairs,
%  and K[p](s,t) is the sum of the entries of G[p].
%
% -Symbols are compared with ==, so the comparison is case-sensitive for
%  strings and also works for numeric row vectors.
%
% -Example: p_spectrum_fast('abccc','abc', 3) returns a value of 1.
%    (Note that p_spectrum_fast('abccc','abc',3)=p_spectrum_fast('abc','abccc',3)
%     since K(s,t,p) = K(t,s,p) ).
% -Example: p_spectrum_fast('a','a', 1) returns a value of 1.
% -Example: p_spectrum_fast('a','b', 1) returns a value of 0.
% -Example: p_spectrum_fast('ab','ab', 2) returns a value of 1.
%
%USAGE: scalar = p_spectrum_fast('string1','string2', p);   (where p is the length of the substring)
%
%       [scalar, matrix] = p_spectrum_fast('string1','string2', p);
%
%INPUTS:  s, t - row vectors (character strings or numeric)
%         p    - positive whole number, the substring length
%OUTPUTS: result - number of index pairs (i,j) such that the length-p
%                  substrings of s ending at i and of t ending at j are equal
%         K      - length(s)-by-length(t) matrix of -1 with K(end,end) = result
%ERRORS:  raised if s or t is not a single row, or if p is not a positive
%         whole-number scalar.
%
%For more information, visit http://www.kernel-methods.net/
%
%Written and tested in Matlab 6.0, Release 12.
%Copyright 2003, Manju M. Pai 4/2003
%manju@kernel-methods.net
%------------------------------------------------------------------------------------------

%Obtain lengths of strings
[num_rows_s, n] = size(s);
[num_rows_t, m] = size(t);

%Error checking statements (done before anything is allocated, so a bad p
%can never reach repmat as a dimension):
%Make sure input vectors are horizontal.
if (num_rows_s ~= 1 | num_rows_t ~= 1)
    error('Error: s and t must be horizontal vectors.');
end;

%p must be a numeric scalar greater than zero ...
if ischar(p) | prod(size(p)) ~= 1
    error('Error: p needs to be a number greater than 0.');
end;
if ~(p > 0)   %written this way so that NaN is rejected as well
    error('Error: p needs to be a number greater than 0.');
end;
%... and a whole number, because it is a substring length.
if p ~= floor(p)
    error('Error: p needs to be a whole number.');
end;
%End of error checking

%Every index of K is set to -1 to show its value has not been computed.
K = repmat(-1, [n, m]);

if (p > n | p > m)
    %Base case: K[p](s,t) = 0 if |s| < p or |t| < p
    result = 0;
else
    %same(i,j) is true exactly when s(i) == t(j)
    same = (repmat(s(:), 1, m) == repmat(t, n, 1));

    %layer(i,j) holds G[q](s(1:i), t(1:j)): 1 when the last q symbols of
    %the two prefixes agree, 0 otherwise. Start with q = 1.
    layer = same;
    for q = 2:p
        previous = layer;
        layer = zeros(n, m);   %prefixes shorter than q cannot match
        %G[q](sa,tb) = G[q-1](s,t)[a==b]: shift the previous layer one
        %step down the diagonal and keep it only where the new last
        %symbols agree.
        layer(q:n, q:m) = same(q:n, q:m) & previous(q-1:n-1, q-1:m-1);
    end;

    %K[p](s,t) is the number of prefix pairs whose last p symbols agree.
    result = sum(layer(:));
end;

%Fill in the last index of K (impossible when either input is 1-by-0).
if (n > 0 & m > 0)
    K(n, m) = result;
end;