Write the algorithm step by step (to search for strings)

Source: Internet
Author: User

 

[Disclaimer: All Rights Reserved. You are welcome to reprint it. Do not use it for commercial purposes. Contact Email: feixiaoxing @ 163.com]

 


We discussed the KMP algorithm earlier, but not in much detail. Today we look at the problem a little more closely. Assume that we search for string B inside string A. The length of string B is n, and the length of string A is much greater than n, so for now we ignore the length of A.

Assume that the comparison starts at the beginning of string A and that the two strings first differ at the p-th character, that is, we are in the following situation:


/*
 *  A: A1 A2 A3 A4 ... Ap ............
 *  B: B1 B2 B3 B4 ... Bp ... Bn
 *                     ^ (p): first mismatch
 */

So what are the options for A at this point? A can be shifted left by 1 position, comparing A(2) ~ A(p-1) with B(1) ~ B(p-2), and then A(p) ~ A(n+1) with B(p-1) ~ B(n); or shifted left by 2 positions, comparing A(3) ~ A(p-1) with B(1) ~ B(p-3), and then A(p) ~ A(n+2) with B(p-2) ~ B(n); and so on, up to a left shift of (p-2) positions, where B(1) is compared with A(p-1), and then A(p) ~ A(p+n-2) with B(2) ~ B(n).

Have attentive readers spotted the pattern? Because A and B agree on their first (p-1) characters, the comparisons above can be rewritten: comparing A(2) ~ A(p-1) with B(1) ~ B(p-2) is really comparing B(2) ~ B(p-1) with B(1) ~ B(p-2); comparing A(3) ~ A(p-1) with B(1) ~ B(p-3) is really comparing B(3) ~ B(p-1) with B(1) ~ B(p-3); and so on, until B(p-1) is compared with B(1). Since all of this data belongs to B itself, we can compute the results in advance.

So how many places should we shift left for so many choices?

In fact, the decision is simple. Suppose we shift left by 1 position and find that A(2) ~ A(p-1) equals B(1) ~ B(p-2); then the comparison can resume directly at A(p) against B(p-1). If that check fails, we shift left by 2 positions and check whether A(3) ~ A(p-1) equals B(1) ~ B(p-3), and so on. If none of the shifts succeeds, we can only start over and compare from the 1st character of B.

If you do not know the details, you can check the following code:


/*
 * Compute how far to shift when the comparison fails at position `index`
 * of the pattern `str`, i.e. after str[0..index) has already matched.
 *
 * The loop looks for the smallest shift `loop` (1 <= loop < index) for which
 * the tail str[loop..index) equals the head str[0..index-loop).
 *
 * Returns that shift when one exists, otherwise 1 (this includes
 * index <= 1, where the loop body never runs).
 */
int calculate_for_special_index(char str[], int index)
{
    int loop;
    int value;

    /* value holds the length of the overlapping head/tail, 0 if none. */
    value = 0;
    for (loop = 1; loop < index; loop++) {
        if (!strncmp(&str[loop], str, (size_t)(index - loop))) {
            value = index - loop;
            break;
        }
    }

    /* index - value is exactly the shift `loop` found above. */
    return (value == 0) ? 1 : (index - value);
}
 
/*
 * Fill data[0..len) with the shift to apply when matching the pattern
 * `str` fails at each index, as computed by calculate_for_special_index().
 *
 * `data` must have room for at least `len` entries.
 */
void calculate_for_max_positon(char str[], int len, int data[])
{
    int index;

    for (index = 0; index < len; index++) {
        data[index] = calculate_for_special_index(str, index);
    }
}

The code above computes, for every index of the pattern, the number of characters to shift when the comparison fails at that index.


/*
 * Find the first occurrence of the pattern `data` in the text `str`,
 * using a precomputed per-index shift table for the pattern.
 *
 * Returns a pointer to the start of the match inside `str`. Returns NULL
 * when there is no match, when either argument is NULL, or when the shift
 * table cannot be allocated. An empty pattern matches at `str`, the same
 * convention as the standard strstr().
 */
char *strstr_kmp(const char *str, char *data)
{
    int index;
    int len;
    int value;
    int *pData;

    if (NULL == str || NULL == data)
        return NULL;

    len = (int)strlen(data);
    if (0 == len)
        return (char *)str;

    pData = calloc((size_t)len, sizeof *pData);
    if (NULL == pData)
        return NULL;

    /* The shifts describe the pattern's own structure, so build them from data. */
    calculate_for_max_positon(data, len, pData);

    /* index counts how many leading pattern characters are known to match at str. */
    index = 0;
    /* NOTE: strlen() is re-evaluated on every pass of this loop. */
    while (*str && (int)strlen(str) >= len) {
        for (; index < len; index++) {
            if (str[index] != data[index])
                break;
        }

        if (index == len) {
            free(pData);
            return (char *)str;
        }

        value = pData[index];
        str += value;

        /* A shift of 1 restarts the comparison; a larger shift keeps the overlap. */
        if (value == 1)
            index = 0;
        else
            index = index - value;
    }

    free(pData);
    return NULL;
}
You may have noticed that strlen is called again on every pass of the loop above. The code therefore still has room for optimization; try improving it yourself first.


/*
 * Check that none of the characters str[start..len) is the terminating
 * '\0'.
 *
 * Returns 1 when no '\0' is found in that range (also when start >= len,
 * where nothing is examined), and 0 as soon as one is found.
 */
int check_valid_for_kmp(char str[], int start, int len)
{
    int index;

    for (index = start; index < len; index++) {
        if ('\0' == str[index])
            return 0;
    }
    return 1;
}
 
/*
 * Find the first occurrence of the pattern `data` in the text `str`,
 * using a precomputed per-index shift table for the pattern. The remaining
 * text length is checked with check_valid_for_kmp() instead of strlen().
 *
 * Returns a pointer to the start of the match inside `str`. Returns NULL
 * when there is no match, when either argument is NULL, or when the shift
 * table cannot be allocated. An empty pattern matches at `str`, the same
 * convention as the standard strstr().
 */
char *strstr_kmp(const char *str, char *data)
{
    int index;
    int len;
    int value;
    int *pData;

    if (NULL == str || NULL == data)
        return NULL;

    len = (int)strlen(data);
    if (0 == len)
        return (char *)str;

    pData = calloc((size_t)len, sizeof *pData);
    if (NULL == pData)
        return NULL;

    /* The shifts describe the pattern's own structure, so build them from data. */
    calculate_for_max_positon(data, len, pData);

    /* index counts how many leading pattern characters are known to match at str. */
    index = 0;
    /*
     * str[0..index) already matched non-NUL pattern characters, so only
     * str[index..len) has to be scanned for the end of the text.
     */
    while (*str && check_valid_for_kmp((char *)str, index, len)) {
        for (; index < len; index++) {
            if (str[index] != data[index])
                break;
        }

        if (index == len) {
            free(pData);
            return (char *)str;
        }

        value = pData[index];
        str += value;

        /* A shift of 1 restarts the comparison; a larger shift keeps the overlap. */
        if (value == 1)
            index = 0;
        else
            index = index - value;
    }

    free(pData);
    return NULL;
}

(3) multi-core search

In fact, multi-core search is nothing new: the text to be searched is split into several parts, and each part is searched on a different core. For example, the CPUs we use are commonly dual-core, so we can split the text to be searched into two parts and run the two searches simultaneously on the two cores. Doing this is not complicated. First, we need to define a data structure:


/* One part of the text to be searched. */
typedef struct _STRING_PART
{
    char *str;  /* start address of this part */
    int len;    /* number of characters in this part */
} STRING_PART;

/* Next, we split the string into two roughly equal parts and compute the start address and length of each. */


/*
 * Split str[0..len) into two parts at the nearest space at or before the
 * midpoint.
 *
 * part[0] covers the start of the string up to and including that space;
 * part[1] covers everything after it. When there is no space at or before
 * the midpoint, part[0] receives the whole string and part[1] is empty.
 *
 * `part` must have room for two entries.
 */
void set_value_for_string_part(char str[], int len, STRING_PART part[])
{
    char *middle = str + (len >> 1);

    /* Walk back to the nearest space, but never past the start of str. */
    while (middle > str && ' ' != *middle)
        middle--;

    if (' ' != *middle) {
        /* No split point found: keep the string in one piece. */
        part[0].str = str;
        part[0].len = len;

        part[1].str = str + len;
        part[1].len = 0;
        return;
    }

    part[0].str = str;
    part[0].len = (int)(middle - str) + 1;

    part[1].str = middle + 1;
    part[1].len = len - part[0].len;
}

/* After the split, the two parts can be searched in parallel. */


Char * strstr_omp (char str [], char data [])
{
Int index;
STRING_PART part [2] = {0 };
Char * result [2] = {0 };
Int len = strlen (str );
 
Set_value_for_string_part (str, len, part );
 
# Pragma omp parellel
For (index = 0; index <2; index ++)
Result [index] = strstr (part [index]. str, part [index]. len, data );
 
If (NULL = result [0] & NULL = result [1])
Return NULL;
 
Return (NULL! = Result [0])? Result [0]: result [1];
}
Char * strstr_omp (char str [], char data [])
{
Int index;
STRING_PART part [2] = {0 };
Char * result [2] = {0 };
Int len = strlen (str );

Set_value_for_string_part (str, len, part );

# Pragma omp parellel
For (index = 0; index <2; index ++)
Result [index] = strstr (part [index]. str, part [index]. len, data );

If (NULL = result [0] & NULL = result [1])
Return NULL;

Return (NULL! = Result [0])? Result [0]: result [1];
} Note:

(1) The OpenMP pragma used here only works on VS2005 or a later version. You also need to add the header file #include <omp.h> and turn on the OpenMP compiler switch.

(2) The search function called here takes the length of the text to be searched as its 2nd parameter, so it differs slightly from the ordinary search functions described earlier. The earlier functions cannot be used directly, but only a small change is needed.

Related Article

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.