BigSnarf blog

Infosec FTW

Getting rid of chart junk in matplotlib – IPython notebook

Great python video – process billions of rows like big data system with just 1 AWS box

Python, Twitter, and “I hate my parents” – Christmas Meme?

Twitter Stream

Screen Shot 2013-12-25 at 10.15.18 AM

Python

Screen Shot 2013-12-25 at 10.19.06 AM

Results

Screen Shot 2013-12-25 at 10.18.47 AM

Screen Shot 2013-12-25 at 10.24.24 AM

 

 

Code

 

from twython import TwythonStreamer
class MyStreamer(TwythonStreamer):
    """Streaming client that echoes tweet text and reports stream errors."""

    def on_success(self, data):
        """Print the UTF-8 encoded text of a streamed message.

        Payloads that carry no 'text' key are ignored.
        """
        if 'text' in data:
            print(data['text'].encode('utf-8'))

    def on_error(self, status_code, data):
        """Print the status code and payload handed to the error callback."""
        # Single formatted argument keeps the output "code payload" while
        # remaining valid syntax on both Python 2 and Python 3.
        print('%s %s' % (status_code, data))
# Build the streaming client from the application's OAuth credentials.
stream = MyStreamer(
    APP_KEY,
    APP_SECRET,
    OAUTH_TOKEN,
    OAUTH_TOKEN_SECRET,
)
# Start following public statuses that match the tracked search term.
stream.statuses.filter(track='iphone')

Arduino Sensor, Python, and Google Analytics

_DSC0333

Screen Shot 2013-12-21 at 8.14.56 PM

Screen Shot 2013-12-21 at 8.19.16 PM

import serial
import time
import urllib2
import urllib
import httplib
# Relay Arduino sensor readings to Google Analytics as 'event' hits.
# Each line received on the serial port is treated as one reading.
ser = serial.Serial('/dev/tty.usbserial-AM01VDMD')
print("connected to: " + ser.portstr)
try:
    while True:
        # Read one newline-terminated reading instead of buffering the
        # stream one byte at a time; strip the trailing line ending.
        result = ser.readline().strip()
        print(result)
        params = urllib.urlencode({
            'v': 1,
            'tid': 'UA-46669546-1',
            'cid': '555',
            't': 'event',
            'ec': 'arduino',
            'ea': 'ldr',
            'ev': result
        })
        connection = httplib.HTTPConnection('www.google-analytics.com')
        try:
            connection.request('POST', '/collect', params)
        finally:
            # A new connection is opened per reading, so release its socket
            # every time rather than leaking one per loop iteration.
            connection.close()
        print("Posted to GA")
        print(params)
finally:
    # The loop only ends via an exception (e.g. Ctrl-C); make sure the
    # serial port is released when that happens.
    ser.close()
"""
// Arduino sketch: blinks the LED with an on/off time taken from the analog
// reading on A0 and prints each reading on the serial port, one per line.
const int ledPin = 13;
const int sensorPin = 0; // NOTE(review): declared but never used; loop() reads A0 directly
void setup() {
 pinMode(ledPin, OUTPUT);
 Serial.begin(9600); // serial output at 9600 baud
}
void loop() {
 int rate = analogRead(A0); // raw analog reading; also used as the blink delay
 digitalWrite(ledPin, HIGH); 
 delay(rate); // keep the LED on for `rate` (delay() takes milliseconds)

 digitalWrite(ledPin, LOW); 
 delay(rate); // keep the LED off for the same duration

 Serial.println(rate); // one reading per line for the host-side reader
 delay(500); //slow down the output for easier reading
}
"""

Motivation: http://www.forbes.com/sites/ericsavitz/2013/01/14/ces-2013-the-break-out-year-for-the-internet-of-things/

Itertools Recipes – Python Docs – So helpful

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    prefix = islice(iterable, n)
    return list(prefix)

def enumerate(iterable, start=0):
    """Pair each item of ``iterable`` with a running index.

    Returns an iterator of ``(index, item)`` tuples where the index begins
    at ``start`` and grows by one per item.
    """
    indices = count(start)
    return izip(indices, iterable)

def tabulate(function, start=0):
    """Lazily yield function(start), function(start + 1), ... without end."""
    arguments = count(start)
    return imap(function, arguments)

def consume(iterator, n=None):
    """Advance ``iterator`` n steps ahead; if n is None, consume it entirely.

    The iterator is advanced in place and nothing is returned. ``n`` now
    defaults to None so the "consume everything" mode described here can be
    requested by simply omitting the argument.
    """
    # The technique uses objects that consume iterators at C speed.
    if n is None:
        # feed the entire iterator into a zero-length deque
        collections.deque(iterator, maxlen=0)
    else:
        # advance to the empty slice starting at position n
        next(islice(iterator, n, n), None)

def nth(iterable, n, default=None):
    """Return the item at zero-based position ``n``, or ``default`` if the
    iterable is too short."""
    remainder = islice(iterable, n, None)
    for item in remainder:
        # The first thing left after skipping n items is the answer.
        return item
    return default

def quantify(iterable, pred=bool):
    """Add up the predicate results over ``iterable``.

    With a boolean predicate this is the number of items for which the
    predicate holds.
    """
    total = 0
    for outcome in imap(pred, iterable):
        total += outcome
    return total

def padnone(iterable):
    """Yield the items of ``iterable`` followed by an endless run of None.

    Handy for emulating how the built-in map() pads shorter inputs.
    """
    endless_none = repeat(None)
    return chain(iterable, endless_none)

def ncycles(iterable, n):
    """Return an iterator over the elements of ``iterable`` repeated n times.

    The input is materialized into a tuple first so that one-shot iterators
    (generators, file objects, ...) are repeated as well; without the copy
    they would be exhausted after the first pass. The input must therefore
    be finite.
    """
    return chain.from_iterable(repeat(tuple(iterable), n))

def dotproduct(vec1, vec2):
    """Return the sum of the element-wise products of ``vec1`` and ``vec2``."""
    products = imap(operator.mul, vec1, vec2)
    return sum(products)

def flatten(listOfLists):
    """Remove one level of nesting and return the items as a single list."""
    flat = []
    for inner in listOfLists:
        flat.extend(inner)
    return flat

def repeatfunc(func, times=None, *args):
    """Call ``func(*args)`` over and over, lazily.

    With ``times`` left as None the calls never stop; otherwise exactly
    ``times`` results are produced.

    Example:  repeatfunc(random.random)
    """
    argument_stream = repeat(args) if times is None else repeat(args, times)
    return starmap(func, argument_stream)

def pairwise(iterable):
    """Yield overlapping neighbours: s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    current, following = tee(iterable, 2)
    # Shift the second copy one item ahead so the two run in lock-step,
    # offset by one position.
    next(following, None)
    return izip(current, following)

def grouper(n, iterable, fillvalue=None):
    """Split ``iterable`` into tuples of length ``n``, padding the last one.

    grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    """
    # Every slot of each tuple pulls from the same iterator, so consecutive
    # items end up grouped together.
    shared = iter(iterable)
    return izip_longest(*([shared] * n), fillvalue=fillvalue)

def roundrobin(*iterables):
    """Yield one item from each iterable in turn until all are exhausted.

    roundrobin('ABC', 'D', 'EF') --> A D E B F C

    Exhausted iterables drop out of the rotation while the remaining ones
    keep their relative order. Uses the builtin next() rather than the
    Python-2-only ``.next`` bound method, and does not rebind the name
    ``next`` inside the function.
    """
    # Original recipe credited to George Sakkis
    active = [iter(it) for it in iterables]
    while active:
        survivors = []
        for iterator in active:
            try:
                item = next(iterator)
            except StopIteration:
                # This source is done; leave it out of the next round.
                continue
            survivors.append(iterator)
            yield item
        active = survivors

def compress(data, selectors):
    """Keep the items of ``data`` whose matching selector is truthy.

    compress('ABCDEF', [1,0,1,0,1,1]) --> A C E F
    """
    flagged = izip(data, selectors)
    return (item for item, keep in flagged if keep)

def combinations_with_replacement(iterable, r):
    """Yield r-length tuples of items, allowing an item to repeat.

    combinations_with_replacement('ABC', 2) --> AA AB AC BB BC CC
    """
    # number items returned:  (n+r-1)! / r! / (n-1)!
    items = tuple(iterable)
    size = len(items)
    if size == 0 and r:
        return
    picks = [0] * r
    last = size - 1
    while True:
        yield tuple(items[p] for p in picks)
        # Find the rightmost position that can still be incremented.
        pos = r - 1
        while pos >= 0 and picks[pos] == last:
            pos -= 1
        if pos < 0:
            # Every position already points at the final item: finished.
            return
        # Bump that position and reset everything to its right to the same
        # value, which keeps each tuple in non-decreasing index order.
        picks[pos:] = [picks[pos] + 1] * (r - pos)

def powerset(iterable):
    """Yield every subset of the input as a tuple, smallest subsets first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    pool = list(iterable)
    subsets_by_size = (combinations(pool, size)
                       for size in range(len(pool) + 1))
    return chain.from_iterable(subsets_by_size)

def unique_everseen(iterable, key=None):
    """Yield each distinct element once, in first-seen order.

    Every element (or its ``key`` value, when a key function is given) is
    remembered for the whole run.
    """
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    remembered = set()
    for element in iterable:
        marker = element if key is None else key(element)
        if marker in remembered:
            continue
        remembered.add(marker)
        yield element

def unique_justseen(iterable, key=None):
    """Collapse runs of consecutive duplicates, keeping the first of each run.

    Only the element just seen is remembered, so a value may reappear later.
    """
    # unique_justseen('AAAABBBCCDAABBB') --> A B C D A B
    # unique_justseen('ABBCcAD', str.lower) --> A B C A D
    runs = groupby(iterable, key)
    run_members = imap(itemgetter(1), runs)
    return imap(next, run_members)

Data Science BookShelf

Screen Shot 2013-11-04 at 3.31.12 PM

Math

  • Linear Algebra and Its Applications by Gilbert Strang (Cengage Learning)
  • Convex Optimization by Stephen Boyd and Lieven Vandenberghe (Cambridge University Press)
  • A First Course in Probability (Pearson) and Introduction to Probability Models (Academic Press) by Sheldon Ross

Coding

  • R in a Nutshell by Joseph Adler (O’Reilly)
  • Learning Python by Mark Lutz and David Ascher (O’Reilly)
  • R for Everyone: Advanced Analytics and Graphics by Jared Lander (Addison-Wesley)
  • The Art of R Programming: A Tour of Statistical Software Design by Norman Matloff (No Starch Press)
  • Python for Data Analysis by Wes McKinney (O’Reilly)

Data Analysis and Statistical Inference

  • Statistical Inference by George Casella and Roger L. Berger (Cengage Learning)
  • Bayesian Data Analysis by Andrew Gelman, et al. (Chapman & Hall)
  • Data Analysis Using Regression and Multilevel/Hierarchical Models by Andrew Gelman and Jennifer Hill (Cambridge University Press)
  • Advanced Data Analysis from an Elementary Point of View by Cosma Shalizi (under contract with Cambridge University Press)
  • The Elements of Statistical Learning: Data Mining, Inference and Prediction by Trevor Hastie, Robert Tibshirani, and Jerome Friedman (Springer)

Artificial Intelligence and Machine Learning

  • Pattern Recognition and Machine Learning by Christopher Bishop (Springer)
  • Bayesian Reasoning and Machine Learning by David Barber (Cambridge University Press)
  • Programming Collective Intelligence by Toby Segaran (O’Reilly)
  • Artificial Intelligence: A Modern Approach by Stuart Russell and Peter Norvig (Prentice Hall)
  • Foundations of Machine Learning by Mehryar Mohri, Afshin Rostamizadeh, and Ameet Talwalkar (MIT Press)
  • Introduction to Machine Learning (Adaptive Computation and Machine Learning) by Ethem Alpaydin (MIT Press)

Experimental Design

  • Field Experiments by Alan S. Gerber and Donald P. Green (Norton)
  • Statistics for Experimenters: Design, Innovation, and Discovery by George E. P. Box, et al. (Wiley-Interscience)

Visualization

  • The Elements of Graphing Data by William Cleveland (Hobart Press)
  • Visualize This: The FlowingData Guide to Design, Visualization, and Statistics by Nathan Yau (Wiley)

List from http://www.amazon.ca/Doing-Data-Science-Straight-Frontline/dp/1449358659

Pinterest Screenshot http://www.pinterest.com/dangleebits/books/

Website Mouse Tracking – Video

How mature is your web analytics team?

web-analytics-process-maturity-model

Mavericks OSX you broke my pydata stack

Screen Shot 2013-10-29 at 3.20.58 PM

Screen Shot 2013-10-29 at 3.32.11 PM
# Reinstall the Python packaging tools and the pydata stack.
# Flags use plain ASCII hyphens ("-O -", "--upgrade"); the typographic
# dashes that blog formatting substitutes are not recognised by wget or pip.

# Bootstrap setuptools: stream the installer to stdout and run it.
wget https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py -O - | sudo python
# NOTE(review): the command above does not save ez_setup.py to disk, so this
# line only works if the file is already present -- confirm it is needed.
sudo python ez_setup.py
sudo easy_install pip
sudo pip install virtualenv
sudo pip install virtualenvwrapper
curl -O http://python-distribute.org/distribute_setup.py
sudo python distribute_setup.py
# Upgrade the scientific packages and their dependencies.
sudo pip install scipy --upgrade
sudo pip install numpy --upgrade
sudo pip install matplotlib --upgrade
sudo pip install pyzmq --upgrade
sudo pip install tornado --upgrade
sudo pip install pygments --upgrade
sudo pip install pandas --upgrade
sudo pip install jinja2 --upgrade

Process weblogs with Hadoop and Excel

Tutorial 10: How to Visualize Website Clickstream Data

http://hortonworks.com/hadoop-tutorial/how-to-visualize-website-clickstream-data/

Follow

Get every new post delivered to your Inbox.

Join 32 other followers