001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.collections4.bloomfilter; 018 019/** 020 * The definition of a Bloom filter shape. 021 * 022 * <p> This class contains the values for the filter configuration and is used to 023 * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are 024 * compatible. (i.e. can be compared or merged)</p> 025 * 026 * <h2>Interrelatedness of values</h2> 027 * 028 * <dl> 029 * <dt>Number of Items ({@code n})</dt> 030 * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd> 031 * <dt>Probability of False Positives ({@code p})</dt> 032 * <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd> 033 * <dt>Number of Bits ({@code m})</dt> 034 * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd> 035 * <dt>Number of Functions ({@code k})</dt> 036 * <dd>{@code k = round((m / n) * ln(2))}</dd> 037 * </dl> 038 * 039 * <h2>Estimations from cardinality based on shape</h2> 040 * 041 * <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p> 042 * 043 * <p>In the calculation below the following values are used:</p> 044 * <ul> 045 * <li>double c = the cardinality of the Bloom filter.</li> 046 * <li>double m = numberOfBits as specified in the shape.</li> 047 * <li>double k = numberOfHashFunctions as specified in the shape.</li> 048 * </ul> 049 * 050 * <h3>Estimate N - n()</h3> 051 * 052 * <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}. This is the calculation 053 * performed by the {@code Shape.estimateN(cardinality)} method below. This estimate is roughly equivalent to the 054 * number of hashers that have been merged into a filter to create the cardinality specified.</p> 055 * 056 * <p><em>Note:</em></p> 057 * <ul> 058 * <li>if cardinality == numberOfBits, then result is infinity.</li> 059 * <li>if cardinality > numberOfBits, then result is NaN.</li> 060 * </ul> 061 * 062 * <h3>Estimate N of Union - n(A ∪ B)</h3> 063 * 064 * <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and 065 * calculate the estimated N from the result.</p> 066 * 067 * <h3>Estimate N of the Intersection - n(A ∩ B)</h3> 068 * 069 * <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is: 070 * n(A) + n(b) - n(A ∪ B).</p> 071 * 072 * <p>Care must be taken when any of the n(x) returns infinity. In general the following assumptions are true: 073 * 074 * <ul> 075 * <li>If n(A) = ∞ and n(B) < ∞ then n(A ∩ B) = n(B)</li> 076 * <li>If n(A) < ∞ and n(B) = ∞ then n(A ∩ B) = n(A)</li> 077 * <li>If n(A) = ∞ and n(B) = ∞ then n(A ∩ B) = ∞</li> 078 * <li>If n(A) < ∞ and n(B) < ∞ and n(A ∪ B) = ∞ then n(A ∩ B) is undefined.</li> 079 * </ul> 080 * 081 * @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a> 082 * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter 083 * [Wikipedia]</a> 084 * @since 4.5 085 */ 086public final class Shape { 087 088 /** 089 * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180559945. 090 */ 091 private static final double LN_2 = Math.log(2.0); 092 093 /** 094 * ln(1 / 2^ln(2)). Used in calculating the number of bits. Approximately -0.480453013918201. 095 * 096 * <p>ln(1 / 2^ln(2)) = ln(1) - ln(2^ln(2)) = -ln(2) * ln(2) 097 */ 098 private static final double DENOMINATOR = -LN_2 * LN_2; 099 100 /** 101 * Calculates the number of hash functions given numberOfItems and numberOfBits. 102 * This is a method so that the calculation is consistent across all constructors. 103 * 104 * @param numberOfItems the number of items in the filter. 105 * @param numberOfBits the number of bits in the filter. 106 * @return the optimal number of hash functions. 107 * @throws IllegalArgumentException if the calculated number of hash function is {@code < 1} 108 */ 109 private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { 110 // k = round((m / n) * ln(2)) We change order so that we use real math rather 111 // than integer math. 112 final long k = Math.round(LN_2 * numberOfBits / numberOfItems); 113 if (k < 1) { 114 throw new IllegalArgumentException(String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k)); 115 } 116 // Normally we would check that numberOfHashFunctions <= Integer.MAX_VALUE but 117 // since numberOfBits is at most Integer.MAX_VALUE the numerator of 118 // numberOfHashFunctions is ln(2) * Integer.MAX_VALUE = 646456992.9449 the 119 // value of k can not be above Integer.MAX_VALUE. 120 return (int) k; 121 } 122 123 /** 124 * Check the calculated probability is {@code < 1.0}. 125 * 126 * <p>This function is used to verify that the dynamically calculated probability for the 127 * Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon 128 * construction. 129 * 130 * @param probability the probability 131 * @throws IllegalArgumentException if the probability is {@code >= 1.0}. 132 */ 133 private static void checkCalculatedProbability(final double probability) { 134 // We do not need to check for p <= 0.0 since we only allow positive values for 135 // parameters and the closest we can come to exp(-kn/m) == 1 is 136 // exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will 137 // always be 0<x<1 and y>0 138 if (probability >= 1.0) { 139 throw new IllegalArgumentException("Calculated probability is greater than or equal to 1: " + probability); 140 } 141 } 142 143 /** 144 * Check number of bits is strictly positive. 145 * 146 * @param numberOfBits the number of bits 147 * @return the number of bits 148 * @throws IllegalArgumentException if the number of bits is {@code < 1}. 149 */ 150 private static int checkNumberOfBits(final int numberOfBits) { 151 if (numberOfBits < 1) { 152 throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); 153 } 154 return numberOfBits; 155 } 156 157 /** 158 * Check number of hash functions is strictly positive. 159 * 160 * @param numberOfHashFunctions the number of hash functions 161 * @return the number of hash functions 162 * @throws IllegalArgumentException if the number of hash functions is {@code < 1}. 163 */ 164 private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { 165 if (numberOfHashFunctions < 1) { 166 throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); 167 } 168 return numberOfHashFunctions; 169 } 170 171 /** 172 * Check number of items is strictly positive. 173 * 174 * @param numberOfItems the number of items 175 * @return the number of items 176 * @throws IllegalArgumentException if the number of items is {@code < 1}. 177 */ 178 private static int checkNumberOfItems(final int numberOfItems) { 179 if (numberOfItems < 1) { 180 throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems); 181 } 182 return numberOfItems; 183 } 184 185 /** 186 * Check the probability is in the range 0.0, exclusive, to 1.0, exclusive. 187 * 188 * @param probability the probability 189 * @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)} 190 */ 191 private static void checkProbability(final double probability) { 192 // Using the negation of within the desired range will catch NaN 193 if (!(probability > 0.0 && probability < 1.0)) { 194 throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability); 195 } 196 } 197 198 /** 199 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 200 * bits ({@code m}). 201 * 202 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 203 * @param numberOfBits The number of bits in the filter 204 * @return a valid Shape. 205 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 206 */ 207 public static Shape fromKM(final int numberOfHashFunctions, final int numberOfBits) { 208 return new Shape(numberOfHashFunctions, numberOfBits); 209 } 210 211 /** 212 * Constructs a filter configuration with the specified number of items ({@code n}) and 213 * bits ({@code m}). 214 * 215 * <p>The optimal number of hash functions ({@code k}) is computed. 216 * <pre>k = round((m / n) * ln(2))</pre> 217 * 218 * <p>The false-positive probability is computed using the number of items, bits and hash 219 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 220 * shape is invalid for use as a Bloom filter). 221 * 222 * @param numberOfItems Number of items to be placed in the filter 223 * @param numberOfBits The number of bits in the filter 224 * @return a valid Shape. 225 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 226 * the calculated number of hash function is {@code < 1}, or if the actual probability is {@code >= 1.0} 227 */ 228 public static Shape fromNM(final int numberOfItems, final int numberOfBits) { 229 checkNumberOfItems(numberOfItems); 230 checkNumberOfBits(numberOfBits); 231 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 232 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 233 // check that probability is within range 234 checkCalculatedProbability(shape.getProbability(numberOfItems)); 235 return shape; 236 } 237 238 /** 239 * Constructs a filter configuration with the specified number of items, bits 240 * and hash functions. 241 * 242 * <p>The false-positive probability is computed using the number of items, bits and hash 243 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 244 * shape is invalid for use as a Bloom filter). 245 * 246 * @param numberOfItems Number of items to be placed in the filter 247 * @param numberOfBits The number of bits in the filter. 248 * @param numberOfHashFunctions The number of hash functions in the filter 249 * @return a valid Shape. 250 * @throws IllegalArgumentException if {@code numberOfItems < 1}, {@code numberOfBits < 1}, 251 * {@code numberOfHashFunctions < 1}, or if the actual probability is {@code >= 1.0}. 252 */ 253 public static Shape fromNMK(final int numberOfItems, final int numberOfBits, final int numberOfHashFunctions) { 254 checkNumberOfItems(numberOfItems); 255 checkNumberOfBits(numberOfBits); 256 checkNumberOfHashFunctions(numberOfHashFunctions); 257 // check that probability is within range 258 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 259 // check that probability is within range 260 checkCalculatedProbability(shape.getProbability(numberOfItems)); 261 return shape; 262 } 263 264 /** 265 * Constructs a filter configuration with the specified number of items ({@code n}) and 266 * desired false-positive probability ({@code p}). 267 * 268 * <p>The number of bits ({@code m}) for the filter is computed. 269 * <pre>m = ceil(n * ln(p) / ln(1 / 2^ln(2)))</pre> 270 * 271 * <p>The optimal number of hash functions ({@code k}) is computed. 272 * <pre>k = round((m / n) * ln(2))</pre> 273 * 274 * <p>The actual probability will be approximately equal to the 275 * desired probability but will be dependent upon the calculated number of bits and hash 276 * functions. An exception is raised if this is greater than or equal to 1 (i.e. the 277 * shape is invalid for use as a Bloom filter). 278 * 279 * @param numberOfItems Number of items to be placed in the filter 280 * @param probability The desired false-positive probability in the range {@code (0, 1)} 281 * @return a valid Shape 282 * @throws IllegalArgumentException if {@code numberOfItems < 1}, if the desired probability 283 * is not in the range {@code (0, 1)} or if the actual probability is {@code >= 1.0}. 284 */ 285 public static Shape fromNP(final int numberOfItems, final double probability) { 286 checkNumberOfItems(numberOfItems); 287 checkProbability(probability); 288 289 // Number of bits (m) 290 final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); 291 if (m > Integer.MAX_VALUE) { 292 throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m); 293 } 294 final int numberOfBits = (int) m; 295 296 final int numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); 297 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 298 // check that probability is within range 299 checkCalculatedProbability(shape.getProbability(numberOfItems)); 300 return shape; 301 } 302 303 /** 304 * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the 305 * specified number of bits ({@code m}) and hash functions ({@code k}). 306 * 307 * <p>The number of items ({@code n}) to be stored in the filter is computed. 308 * <pre>n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))</pre> 309 * 310 * <p>The actual probability will be approximately equal to the 311 * desired probability but will be dependent upon the calculated Bloom filter capacity 312 * (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the 313 * shape is invalid for use as a Bloom filter). 314 * 315 * @param probability The desired false-positive probability in the range {@code (0, 1)} 316 * @param numberOfBits The number of bits in the filter 317 * @param numberOfHashFunctions The number of hash functions in the filter 318 * @return a valid Shape. 319 * @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}, 320 * {@code numberOfBits < 1}, {@code numberOfHashFunctions < 1}, or the actual 321 * probability is {@code >= 1.0} 322 */ 323 public static Shape fromPMK(final double probability, final int numberOfBits, final int numberOfHashFunctions) { 324 checkProbability(probability); 325 checkNumberOfBits(numberOfBits); 326 checkNumberOfHashFunctions(numberOfHashFunctions); 327 328 // Number of items (n): 329 // n = ceil(m / (-k / ln(1 - exp(ln(p) / k)))) 330 final double n = Math.ceil(numberOfBits / (-numberOfHashFunctions / Math.log(-Math.expm1(Math.log(probability) / numberOfHashFunctions)))); 331 332 // log of probability is always < 0 333 // number of hash functions is >= 1 334 // e^x where x < 0 = [0,1) 335 // log 1-e^x = [log1, log0) = <0 with an effective lower limit of -53 336 // numberOfBits/ (-numberOfHashFunctions / [-53,0) ) >0 337 // ceil( >0 ) >= 1 338 // so we can not produce a negative value thus we don't check for it. 339 // 340 // similarly we can not produce a number greater than numberOfBits so we 341 // do not have to check for Integer.MAX_VALUE either. 342 343 final Shape shape = new Shape(numberOfHashFunctions, numberOfBits); 344 // check that probability is within range 345 checkCalculatedProbability(shape.getProbability((int) n)); 346 return shape; 347 } 348 349 /** 350 * Number of hash functions to create a filter ({@code k}). 351 */ 352 private final int numberOfHashFunctions; 353 354 /** 355 * Number of bits in the filter ({@code m}). 356 */ 357 private final int numberOfBits; 358 359 /** 360 * Constructs a filter configuration with the specified number of hashFunctions ({@code k}) and 361 * bits ({@code m}). 362 * 363 * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. 364 * @param numberOfBits The number of bits in the filter 365 * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} 366 */ 367 private Shape(final int numberOfHashFunctions, final int numberOfBits) { 368 this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); 369 this.numberOfBits = checkNumberOfBits(numberOfBits); 370 } 371 372 @Override 373 public boolean equals(final Object obj) { 374 // Shape is final so no check for the same class as inheritance is not possible 375 if (obj instanceof Shape) { 376 final Shape other = (Shape) obj; 377 return numberOfBits == other.numberOfBits && numberOfHashFunctions == other.numberOfHashFunctions; 378 } 379 return false; 380 } 381 382 /** 383 * Estimates the maximum number of elements that can be merged into a filter of 384 * this shape before the false positive rate exceeds the desired rate. <p> The 385 * formula for deriving {@code k} when {@code m} and {@code n} are known is: 386 * 387 * <p>{@code k = ln2 * m / n}</p> 388 * 389 * <p>Solving for {@code n} yields:</p> 390 * 391 * <p>{@code n = ln2 * m / k}</p> 392 * 393 * @return An estimate of max N. 394 */ 395 public double estimateMaxN() { 396 return numberOfBits * LN_2 / numberOfHashFunctions; 397 } 398 399 /** 400 * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. 401 * 402 * <p><em>Note:</em></p> 403 * <ul> 404 * <li> if cardinality == numberOfBits, then result is infinity.</li> 405 * <li> if cardinality > numberOfBits, then result is NaN.</li> 406 * </ul> 407 * 408 * @param cardinality the number of enabled bits also known as the hamming value. 409 * @return An estimate of the number of items in the Bloom filter. 410 */ 411 public double estimateN(final int cardinality) { 412 final double c = cardinality; 413 final double m = numberOfBits; 414 final double k = numberOfHashFunctions; 415 return -(m / k) * Math.log1p(-c / m); 416 } 417 418 /** 419 * Gets the number of bits in the Bloom filter. 420 * This is also known as {@code m}. 421 * 422 * @return the number of bits in the Bloom filter ({@code m}). 423 */ 424 public int getNumberOfBits() { 425 return numberOfBits; 426 } 427 428 /** 429 * Gets the number of hash functions used to construct the filter. 430 * This is also known as {@code k}. 431 * 432 * @return the number of hash functions used to construct the filter ({@code k}). 433 */ 434 public int getNumberOfHashFunctions() { 435 return numberOfHashFunctions; 436 } 437 438 /** 439 * Calculates the probability of false positives ({@code p}) given 440 * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). 441 * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre> 442 * 443 * <p>This is the probability that a Bloom filter will return true for the presence of an item 444 * when it does not contain the item.</p> 445 * 446 * <p>The probability assumes that the Bloom filter is filled with the expected number of 447 * items. If the filter contains fewer items then the actual probability will be lower. 448 * Thus, this returns the worst-case false positive probability for a filter that has not 449 * exceeded its expected number of items.</p> 450 * 451 * @param numberOfItems the number of items hashed into the Bloom filter. 452 * @return the probability of false positives. 453 */ 454 public double getProbability(final int numberOfItems) { 455 if (numberOfItems < 0) { 456 throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); 457 } 458 if (numberOfItems == 0) { 459 return 0; 460 } 461 return Math.pow(-Math.expm1(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), numberOfHashFunctions); 462 } 463 464 @Override 465 public int hashCode() { 466 // Match Arrays.hashCode(new int[] {numberOfBits, numberOfHashFunctions}) 467 return (31 + numberOfBits) * 31 + numberOfHashFunctions; 468 } 469 470 /** 471 * Determines if a cardinality is sparse based on the shape. 472 * <p>This method assumes that bit maps are 64bits and indexes are 32bits. If the memory 473 * necessary to store the cardinality as indexes is less than the estimated memory for bit maps, 474 * the cardinality is determined to be {@code sparse}.</p> 475 * @param cardinality the cardinality to check. 476 * @return true if the cardinality is sparse within the shape. 477 */ 478 public boolean isSparse(final int cardinality) { 479 /* 480 * Since the size of a bit map is a long and the size of an index is an int, 481 * there can be 2 indexes for each bit map. In Bloom filters indexes are evenly 482 * distributed across the range of possible values, Thus if the cardinality 483 * (number of indexes) is less than or equal to 2*number of bit maps the 484 * cardinality is sparse within the shape. 485 */ 486 return cardinality <= BitMaps.numberOfBitMaps(getNumberOfBits()) * 2; 487 } 488 489 @Override 490 public String toString() { 491 return String.format("Shape[k=%s m=%s]", numberOfHashFunctions, numberOfBits); 492 } 493}