A Bloom filter, conceived by Burton Howard Bloom in 1970,[1] is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positive retrieval results are possible, but false negatives are not; i.e. a query returns either “inside set (may be wrong)” or “definitely not in set”. Read more about it on http://en.wikipedia.org/wiki/Bloom_filter

//http://en.literateprograms.org/Hash_function_comparison_%28C%2C_sh%29 module BloomFilter = let SDBMHash (v:string) : uint32 = let mutable h = 0u for i = 0 to v.Length - 1 do h <- ((uint32)(v.Chars (i)))+(h <<< 6)+(h <<< 16)-h h // SAX: http://www.eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx let SAXHash (v: string): uint32 = let mutable h = 0u for i = 0 to v.Length - 1 do h <- h ^^^ (h <<< 5) + (h >>> 2) + (uint32)(v.Chars (i)) h let GetHash (v:string):(uint32 * uint32) = (SAXHash v , SDBMHash v) // double hashing: http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf Section 5.1 type Bloom(filterSize: float32) = let blockSize = (int32) 100000000 //100 MB Block let p = 0.001f //false positive tolerance let l2 = float32 (log 2.0) //From http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives let m = (-filterSize * log p) / float32 (l2 * l2) let mEff = float32 2.0 ** ceil(float32 (log m) / l2) let b = mEff / 8.0F (* for constant 50 Collissions %:14.907256 Existing Check- SucccessCtr % : 100 Non existing Check - SucccessCtr % : 99 changing this to 30 drop the accuracy to 21% *) let numBlocks = (int)(b / float32 blockSize) + 50 //Magical constant to increase numBlocks to increase the accuracy of BF, for 100 Million objects it is 99% accurate. let XAxis (v:uint32) :uint32 = v % (uint32) numBlocks let YAxis (v:uint32) :uint32 = v % (uint32) blockSize let mutable collisionCounter = 0u let mutable filterBits = [| for i in 0u ..(uint32)numBlocks -> Array.zeroCreate blockSize |] let Check (a,b) = let byteVal = (byte) (1 <<< (int)(a % 8u)) if (((filterBits.[int (XAxis a)]).[int (YAxis a)] &&& byteVal ) = 0uy ) then true else let byteVal = (byte)(1 <<< (int)(b % 8u)) if ((filterBits.[int (XAxis b)]).[int (YAxis b)] &&& byteVal = 0uy) then true //Certainly not in Bloom filter 100% sure else false //may be :( let Insert (v: uint32) = let mutable bCollision = false let mutable o = (filterBits.[int(XAxis v)]).[int(YAxis v)] if ((byte) o) <> 0uy then bCollision <- true let byteVal = (byte) (1 <<< (int)(v % 8u)) o <- o ||| byteVal (filterBits.[int (XAxis v)]).SetValue (o, int(YAxis v)) bCollision member this.Collisions = collisionCounter member this.AddValue (v:string) = let a,b = GetHash v if (Insert a && Insert b) then collisionCounter <- collisionCounter + 1u ///<summary> return true, definatly not in Bloom filter. returns false for may be </summary> member this.CheckValue (v:string) = GetHash v |> Check member this.BloomSize = blockSize * numBlocks

Advertisements