BigSnarf blog

Infosec FTW

T-Digest Algebird

https://github.com/twitter/algebird/compare/develop...erikerlandson:feature/tdigest

https://github.com/CamDavidsonPilon/tdigest

http://koff.io/posts/using-t-digest/

https://github.com/fluxcapacitor/pipeline/tree/master/myapps/spark/streaming/src/main/scala/com/advancedspark/streaming/rating/approx

Example of T-Digest plus method with Algebird Semigroup


/**
* Example of T-Digest plus method with Algebird Semigroup
*/
import com.tdunning.math.stats.TDigest
import com.twitter.algebird.{Group, Semigroup}
import io.koff.t_digest._
/**
 * Algebird `Semigroup` instance that combines `TDigest` values by adding
 * them into a newly created digest.
 */
case object TDigestSemigroup extends Semigroup[TDigest] {

  /**
   * Combines two digests.
   *
   * A new digest is created with the larger of the two compression values,
   * and both `l` and `r` are then added into it.
   *
   * @param l left-hand digest
   * @param r right-hand digest
   * @return the newly created digest holding the contents of `l` and `r`
   */
  override def plus(l: TDigest, r: TDigest): TDigest = {
    val compression = math.max(l.compression(), r.compression())
    val merged = TDigest.createDigest(compression)
    merged.add(l)
    merged.add(r)
    merged
  }

  /**
   * Combines every digest in `iter` into a single accumulator digest.
   *
   * The accumulator is created when the first element is seen, using that
   * element's compression; each element (including the first) is added into it.
   * Note: unlike `plus`, which takes the larger compression of its two inputs,
   * the compression of later elements is not consulted here.
   *
   * @param iter digests to combine
   * @return `None` when `iter` is empty, otherwise `Some` of the accumulator
   */
  override def sumOption(iter: TraversableOnce[TDigest]): Option[TDigest] =
    iter.foldLeft(Option.empty[TDigest]) { (soFar, el) =>
      // getOrElse takes its argument by name, so a digest is only created
      // for the very first element.
      val acc = soFar.getOrElse(TDigest.createDigest(el.compression()))
      acc.add(el)
      Some(acc)
    }
}
// Session-duration bounds, expressed in milliseconds.
val oneSecond = 1000
val twoMinutes = 2 * 60 * 1000
val tenMinutes = 10 * 60 * 1000
val twoHours = 2 * 60 * 60 * 1000
// Sample sizes: badValues is 0.1% of mainValues.
val mainValues = 10000000
val badValues = 10000
// generate 10.000.000 pseudo-random values for normal user session durations
val goodData = Generator.generate(count = mainValues, from = oneSecond, to = twoMinutes)
// generate 10.000 (0.1% of the normal sample) pseudo-random values for invalid user session durations
val badData = Generator.generate(count = badValues, from = tenMinutes, to = twoHours)
// combined data set: the normal durations followed by the invalid ones
val allData = goodData ++ badData
// One digest per data set, each created with compression 100.
// (An earlier variant of this example used TDigest.createTreeDigest(100) instead.)
val goodDigest = TDigest.createAvlTreeDigest(100)
val badDigest = TDigest.createAvlTreeDigest(100)
val allDigest = TDigest.createAvlTreeDigest(100)
// add good data values to good digest
goodData.foreach(sample => goodDigest.add(sample))
// add bad data values to bad digest
badData.foreach(sample => badDigest.add(sample))
// add every value (good and bad) to the combined digest
allData.foreach(sample => allDigest.add(sample))
// Anomaly threshold of a digest: its 0.999 quantile converted to Int,
// i.e. we expect ~0.1% of the digest's data to be anomalies above it.
def anomalyThreshold(digest: TDigest): Int = {
  val cutoff = digest.quantile(0.999d)
  cutoff.toInt
}
// threshold of the digest that was fed the combined data set directly
val thresholdAllDigest = anomalyThreshold(allDigest)
val tds = TDigestSemigroup
// merge the separately built good and bad digests through the semigroup
val plusDigest = tds.plus(goodDigest, badDigest)
val thresholdPlusDigest = anomalyThreshold(plusDigest)
// thresholds of the individual digests
val thresholdGoodDigest = anomalyThreshold(goodDigest)
val thresholdBadDigest = anomalyThreshold(badDigest)

view raw

t-digetst.scala

hosted with ❤ by GitHub

http://erikerlandson.github.io/blog/2015/09/26/a-library-of-binary-tree-algorithms-as-mixable-scala-traits/

https://github.com/tdunning/t-digest/tree/master/src/main/java/com/tdunning/math/stats

 

https://github.com/HdrHistogram/HdrHistogram

http://www.ebaytechblog.com/2015/08/19/statistical-anomaly-detection/

 

Click to access histo.pdf

Click to access histo.pdf

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

%d bloggers like this: