BigSnarf blog

Infosec FTW

Cleaning tweetstream or twitter archive

import string

  'a', 'about', 'also', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be',

  'but', 'by', 'can', 'com', 'did', 'do', 'does', 'for', 'from', 'had',

  'has', 'have', 'he', "he'd", "he'll", "he's", 'her', 'here', 'hers',

  'him', 'his', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is',

  'it', "it's", 'its', 'just', 'me', 'mine', 'my', 'of', 'on', 'or', 'org',

  'our', 'ours', 'she', "she'd", "she'll", "she's", 'some', 'than', 'that',

  'the', 'their', 'them', 'then', 'there', 'these', 'they', "they'd",

  "they'll", "they're", 'this', 'those', 'to', 'us', 'was', 'we', "we'd", 

  "we'll", "we're", 'were', 'what', 'where', 'which', 'who', 'will', 'with',

  'would', 'you', 'your', 'yours',

def clean_data(data):
    """Return ``data`` with every ``string.punctuation`` character removed.

    All other characters, including whitespace, are kept in their
    original order.
    """
    kept = [ch for ch in data if ch not in string.punctuation]
    return "".join(kept)

def clean_stop_words(data):
    """Return ``data`` with whole-word stop words removed.

    The text is split on whitespace and every token whose lowercase form
    appears in ``DEFAULT_STOP_WORDS`` is dropped. Matching whole tokens,
    rather than calling ``str.replace`` for each stop word, avoids
    deleting stop-word substrings from inside other words (for example
    the 'a' in "data" or the 'he' in "cipher").

    The remaining tokens are joined with single spaces, so runs of
    whitespace in the input are collapsed. Tokens are compared as they
    appear in the text: a word with punctuation still attached
    (e.g. "the,") is not treated as a stop word.
    """
    stop_words = frozenset(DEFAULT_STOP_WORDS)
    kept = [
        token for token in data.split()
        if token.lower() not in stop_words
    ]
    return " ".join(kept)

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Connecting to %s

%d bloggers like this: