BigSnarf blog

Infosec FTW

Cleaning tweetstream or twitter archive

import re
import string
# Lower-case English stop words (articles, pronouns, auxiliaries, common
# contractions) plus the URL fragments "com" and "org".
DEFAULT_STOP_WORDS = [
    "a", "about", "also", "am", "an", "and", "any", "are", "as", "at",
    "be", "but", "by", "can", "com", "did", "do", "does", "for", "from",
    "had", "has", "have", "he", "he'd", "he'll", "he's", "her", "here",
    "hers", "him", "his", "i", "i'd", "i'll", "i'm", "i've", "if", "in",
    "into", "is", "it", "it's", "its", "just", "me", "mine", "my", "of",
    "on", "or", "org", "our", "ours", "she", "she'd", "she'll", "she's",
    "some", "than", "that", "the", "their", "them", "then", "there",
    "these", "they", "they'd", "they'll", "they're", "this", "those",
    "to", "us", "was", "we", "we'd", "we'll", "we're", "were", "what",
    "where", "which", "who", "will", "with", "would", "you", "your",
    "yours",
]
def clean_data(data):
    """Return *data* with every ``string.punctuation`` character removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    unwanted = set(string.punctuation)
    return "".join(ch for ch in data if ch not in unwanted)

def clean_stop_words(data, stop_words=None):
    """Remove stop words from *data*, matching whole words only.

    A stop word is deleted only when it stands alone: the characters on
    either side must not be word characters or apostrophes. This keeps
    "a" from being stripped out of "banana" and "he" out of "he'd" or
    "theory". Matching is case-sensitive, so lower-case the text first
    when the stop words are lower-case. Surrounding whitespace is left
    untouched.

    Args:
        data: The text to clean.
        stop_words: Iterable of words to remove. Defaults to
            ``DEFAULT_STOP_WORDS`` when omitted or ``None``.

    Returns:
        The text with every standalone stop word removed.
    """
    if stop_words is None:
        stop_words = DEFAULT_STOP_WORDS

    # Drop empty entries (they would match everywhere) and try the longest,
    # most specific alternatives first.
    ordered = sorted({w for w in stop_words if w}, key=len, reverse=True)
    if not ordered:
        return data

    alternatives = "|".join(re.escape(w) for w in ordered)
    # The lookarounds treat apostrophes as part of a word so contractions
    # are only ever removed as a whole.
    pattern = r"(?<![\w'])(?:" + alternatives + r")(?![\w'])"
    return re.sub(pattern, "", data)

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. ( Log Out / Change )

Twitter picture

You are commenting using your Twitter account. ( Log Out / Change )

Facebook photo

You are commenting using your Facebook account. ( Log Out / Change )

Google+ photo

You are commenting using your Google+ account. ( Log Out / Change )

Connecting to %s

%d bloggers like this: