I. INTRODUCTION
1. What is a Bloom filter?
A Bloom filter is a bit-vector data structure proposed by Burton Howard Bloom in 1970. It is very efficient in both space and time and is used to test whether an element is a member of a set. The test can only err in one direction: it may report an element as being in the set when it is not (a false positive), but it never reports an element that is in the set as absent. Each query therefore returns either "in the set (possibly wrong)" or "not in the set (definitely not in the set)". In other words, a Bloom filter sacrifices some accuracy in exchange for time and space.
2. How does a Bloom filter work?
To decide whether an element is in a set, the usual approach is to store all the elements and find the answer by comparison; linked lists and trees are based on this idea. As the number of elements in the set grows, the space and time required grow linearly and lookups become slower and slower. A Bloom filter instead uses a hash function to map an element to one position in a bit array of length m: if that bit is 1, the element is considered to be in the set, and if it is 0, the element is not. The drawback of this method is that when there are many elements, different elements may collide on the same bit. The solution is to use k hash functions that map each element to k positions: if all k bits are 1, the element is (probably) in the set; if any of them is 0, the element is definitely not in the set.
3. Characteristics of a Bloom filter
The advantages of a Bloom filter are that insertion and query both take constant time, and that it does not store the elements themselves, which is good for confidentiality. Its flaws are also obvious: the more elements you insert, the greater the probability of wrongly reporting an element as "in the set"; and a Bloom filter cannot delete an element, because several elements may hash to the same bit in the filter, so clearing a bit may affect the detection of multiple elements.
II. CODE IMPLEMENTATION
Below is a C implementation of a Bloom filter, built and run under Linux:
The code is as follows:
bloom.h:
#ifndef BLOOM_H
#define BLOOM_H

#include <stdlib.h>

/* A hash function maps a NUL-terminated string to an unsigned integer. */
typedef unsigned int (*hashfunc_t) (const char *);

/* State of one Bloom filter. */
typedef struct {
    size_t asize;       /* number of bits addressed in the bit array */
    unsigned char *a;   /* the bit array itself */
    size_t nfuncs;      /* number of entries in funcs */
    hashfunc_t *funcs;  /* hash functions applied to every string */
} BLOOM;

/*
 * Create a filter of `size` bits. The variadic arguments are `nfuncs`
 * values of type hashfunc_t. Returns NULL if allocation fails.
 */
BLOOM *bloom_create (size_t size, size_t nfuncs, ...);

/* Release a filter obtained from bloom_create(). Returns 0. */
int bloom_destroy (BLOOM *bloom);

/* Record the string `s` in the filter. Returns 0. */
int bloom_add (BLOOM *bloom, const char *s);

/*
 * Test the string `s` against the filter. Returns 0 if `s` was definitely
 * never added, 1 if it may have been added.
 */
int bloom_check (BLOOM *bloom, const char *s);

#endif
bloom.c:
#include <limits.h>
#include <stdarg.h>
#include "bloom.h"
/*
 * Bit helpers for an array of unsigned char used as a bit vector.
 * Note that `n` is evaluated twice, so pass an expression without side
 * effects.
 */

/* Set bit number n of the byte array a. */
#define SETBIT(a, n) ((a)[(n) / CHAR_BIT] |= (1 << ((n) % CHAR_BIT)))

/* Non-zero if bit number n of the byte array a is set, 0 otherwise. */
#define GETBIT(a, n) ((a)[(n) / CHAR_BIT] & (1 << ((n) % CHAR_BIT)))
BLOOM *bloom_create (size_t size, size_t nfuncs, ...)
{
BLOOM *bloom;
Va_list l;
int n;
if (!) ( Bloom=malloc (sizeof (BLOOM))) return NULL;
if (!) ( Bloom->a=calloc ((size+char_bit-1)/char_bit, sizeof (CHAR))) {
Free (bloom);
return NULL;
}
if (!) ( Bloom->funcs= (hashfunc_t*) malloc (nfuncs*sizeof (hashfunc_t))) {
Free (bloom->a);
Free (bloom);
return NULL;
}
Va_start (L, Nfuncs);
For (n=0 n<nfuncs; ++n) {
Bloom->funcs[n]=va_arg (L, hashfunc_t);
}
Va_end (l);
bloom->nfuncs=nfuncs;
bloom->asize=size;
return bloom;
}
int Bloom_destroy (Bloom *bloom)
{
Free (bloom->a);
Free (BLOOM->FUNCS);
Free (bloom);
return 0;
}
int Bloom_add (Bloom *bloom, const char *s)
{
size_t N;
For (n=0 n<bloom->nfuncs; ++n) {
Setbit (Bloom->a, Bloom->funcs[n] (s)%bloom->asize);
}
return 0;
}
int Bloom_check (Bloom *bloom, const char *s)
{
size_t N;
For (n=0 n<bloom->nfuncs; ++n) {
if (!) ( Getbit (Bloom->a, Bloom->funcs[n] (s)%bloom->asize)) return 0;
}
return 1;
}
test.c:
#include <stdio.h>
#include <string.h>
#include "bloom.h"
/* Two string hash functions used by the Bloom filter. */
/*
 * Hash a NUL-terminated string by folding each byte into the accumulator:
 * acc ^= (acc << 5) + (acc >> 2) + byte. Bytes are read as unsigned char.
 * Returns 0 for the empty string.
 */
unsigned int sax_hash (const char *key)
{
    const unsigned char *cursor = (const unsigned char *) key;
    unsigned int acc = 0;

    for (; *cursor != '\0'; ++cursor) {
        acc ^= (acc << 5) + (acc >> 2) + *cursor;
    }

    return acc;
}
/*
 * Hash a NUL-terminated string: for each byte c (read as unsigned char),
 * h = c + (h << 6) + (h << 16) - h. The arithmetic is unsigned, so it
 * wraps rather than overflowing. Returns 0 for the empty string.
 */
unsigned int sdbm_hash (const char *key)
{
    unsigned int h = 0;

    while (*key) {
        h = (unsigned char) *key++ + (h << 6) + (h << 16) - h;
    }

    return h;
}
int main (int argc, char *argv[])
{
FILE *FP;
Char line[1024];
Char *p;
BLOOM *bloom;
if (argc<2) {
fprintf (stderr, "Error:no Word file specified\n");
return exit_failure;
}
if (!) ( Bloom=bloom_create (2500000, 2, Sax_hash, Sdbm_hash)) {
fprintf (stderr, "error:could not create Bloom filter\n");
return exit_failure;
}
if (!) ( Fp=fopen (Argv[1], "R"))) {
fprintf (stderr, "error:could not open File%s\n", argv[1]);
return exit_failure;
}
while (Fgets (line, 1024, FP)) {
if (P=STRCHR (line, ' \ R ')) *p= ';/Enter
if (P=STRCHR (line, ' \ n ')) *p= ' ";//Line Wrap
Bloom_add (Bloom, line);
}
Fclose (FP);
while (Fgets (line, 1024, stdin)) {
if (P=STRCHR (line, ' \ R ')) *p= ';
if (P=STRCHR (line, ' \ n ')) *p= ';
P=strtok (line, "\t,.;:\ r\n?! -/()");
while (p) {
if (!bloom_check (Bloom, p)) {
printf ("No match for Ford \"%s\ "\ n", p);
}
Else
printf ("Match for Ford \"%s\ "\ n", p);
P=strtok (NULL, "\t,.;:\ r\n?! -/()");
}
}
Bloom_destroy (Bloom);
return exit_success;
}
Makefile:
# Build the Bloom filter demo program from bloom.c and test.c.
all: bloom

# Link the two object files into the executable.
bloom: bloom.o test.o
	cc -o bloom -Wall -pedantic bloom.o test.o

# Filter implementation; rebuilt when its source or the header changes.
bloom.o: bloom.c bloom.h
	cc -o bloom.o -Wall -pedantic -ansi -c bloom.c

# Demo driver; rebuilt when its source or the header changes.
test.o: test.c bloom.h
	cc -o test.o -Wall -pedantic -ansi -c test.c