From 3f1ec95529c082df88083f4a69d72e938ac5d8b8 Mon Sep 17 00:00:00 2001
From: Shao Wang
Date: Thu, 30 Aug 2018 19:29:21 +1000
Subject: [PATCH] Upload Yiran's code.

---
 README.md                     |  4 +-
 mlweb_backend/preprocessor.py | 23 ++++++++++
 mlweb_backend/processor.py    | 68 +++++++++++++++++++++++++++++
 mlweb_backend/streaming.py    | 73 ++++++++++++++++++++++++++++++++
 mlweb_backend/web_utils.py    | 29 ++++++++++++
 5 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 mlweb_backend/preprocessor.py
 create mode 100644 mlweb_backend/processor.py
 create mode 100644 mlweb_backend/streaming.py
 create mode 100644 mlweb_backend/web_utils.py

diff --git a/README.md b/README.md
index 5b0193d..bc12b74 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,3 @@
-# ml-web
\ No newline at end of file
+# ml-web
+
+Twitter analysis.
\ No newline at end of file
diff --git a/mlweb_backend/preprocessor.py b/mlweb_backend/preprocessor.py
new file mode 100644
index 0000000..c509615
--- /dev/null
+++ b/mlweb_backend/preprocessor.py
@@ -0,0 +1,23 @@
+import pandas as pd
+from sqlalchemy import create_engine
+
+from web_utils import clean_tweet, get_db_data
+
+# get the tweets from the streaming database
+tweets = get_db_data('?.db', 'SELECT tweet, created_at FROM ')
+
+# build the preprocessed pandas DataFrame of tweets
+clean_tweets = []
+tweets_datetime = []
+for tweet in tweets:
+    clean_tweets.append(clean_tweet(tweet[0]))
+    tweets_datetime.append(tweet[1])
+
+preprocessed_dict = {'tweet': clean_tweets, 'date': tweets_datetime}
+preprocessed_df = pd.DataFrame(preprocessed_dict)
+preprocessed_df.date = pd.to_datetime(preprocessed_df.date)
+
+
+# store the preprocessed tweets in the database, replacing any previous table
+engine = create_engine('sqlite:///preprocessed_tweets.db')
+preprocessed_df.to_sql('preprocessed_tweets', con=engine, if_exists='replace')
diff --git a/mlweb_backend/processor.py b/mlweb_backend/processor.py
new file mode 100644
index 0000000..3752661
--- /dev/null
+++ b/mlweb_backend/processor.py
@@ -0,0 +1,68 @@
+import datetime
+import pickle
+
+import spacy
+from gensim import models, corpora
+
+from web_utils import get_db_data
+
+# get the current time (UTC) and the time one hour before it
+now = datetime.datetime.utcnow()
+before = now - datetime.timedelta(hours=1)
+
+# select the tweets between these dates (parameterised to avoid quoting issues)
+query = 'SELECT tweet FROM preprocessed_tweets WHERE date BETWEEN ? AND ?'
+tweets_clean = get_db_data('preprocessed_tweets.db', query, (str(before), str(now)))
+
+
+# tokenise and lemmatise the tweets, then build the gensim dictionary and corpus
+def generate_nlp_object():
+    nlp = spacy.load('en')
+
+    # mark any extra stop words once, before processing the tweets
+    my_stop_words = ['']
+    for stopword in my_stop_words:
+        nlp.vocab[stopword].is_stop = True
+        nlp.vocab[stopword.lower()].is_stop = True
+
+    texts = []
+    for tweet in tweets_clean:
+        text = []
+        doc = nlp(tweet[0])
+        for w in doc:
+            if (not w.is_stop and not w.is_punct and not w.like_num
+                    and w.lemma_ != '-PRON-' and len(w) > 3):
+                text.append(w.lemma_)
+        texts.append(text)
+
+    # merge frequent word pairs into bigram tokens
+    bigram = models.Phrases(texts)
+    texts = [bigram[line] for line in texts]
+    dictionary = corpora.Dictionary(texts)
+    # dictionary.filter_extremes(no_below=10, no_above=0.8)
+    corpus = [dictionary.doc2bow(text) for text in texts]
+
+    return texts, dictionary, corpus
+
+
+texts, dictionary, corpus = generate_nlp_object()
+
+
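+# note (assumption, added for clarity): with the default num_topics=1 below,
+# LdaModel fits a single latent topic to the hour's tweets, and show_topics()
+# returns a list of (topic_id, word-probability string) pairs describing it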
+# use LDA to generate topics
+def generate_topics_lda(num_topics=1):
+    lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
+    return lda_model
+
+
+model = generate_topics_lda()
+topics_list = model.show_topics()
+
+# store the generated topics in a pickle file named after the hour window
+now_name = now.strftime('%m%d%H%M%S')
+before_name = before.strftime('%m%d%H%M%S')
+pickle_name = now_name + '-' + before_name + '.pkl'
+with open(pickle_name, 'wb') as pickle_out:
+    pickle.dump(topics_list, pickle_out)
diff --git a/mlweb_backend/streaming.py b/mlweb_backend/streaming.py
new file mode 100644
index 0000000..b8d904c
--- /dev/null
+++ b/mlweb_backend/streaming.py
@@ -0,0 +1,73 @@
+import logging
+
+from sqlalchemy import Column, Integer, String, create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+from tweepy import Stream, StreamListener
+
+from web_utils import set_twitter_app_tokens
+
+# set the log file
+logging.basicConfig(filename="streaming.log",
+                    level=logging.DEBUG,
+                    format="%(asctime)s:%(levelname)s:%(message)s")
+
+
+base = declarative_base()
+table_name = ''
+db_name = ''
+
+
+# database table for the streamed tweets
+class TweetDb(base):
+    __tablename__ = table_name
+
+    index = Column(Integer, primary_key=True)
+    tweet = Column(String(500))
+    created_at = Column(String(300))
+    user_name = Column(String(300))
+
+
+# create the session for the database
+engine = create_engine('sqlite:///' + db_name + '.db')
+base.metadata.create_all(engine)
+db_session = sessionmaker(bind=engine)
+session = db_session()
+
+
+# streaming listener that writes each incoming tweet to the database
+class MyStreamListener(StreamListener):
+    def __init__(self, sess):
+        super().__init__()
+        self.session = sess
+
+    def on_status(self, status):
+        # skip retweets: only they carry a retweeted_status attribute
+        if hasattr(status, 'retweeted_status'):
+            return
+
+        new_data = TweetDb(tweet=status.text,
+                           created_at=str(status.created_at),
+                           user_name=status.user.screen_name)
+
+        self.session.add(new_data)
+        self.session.commit()
+
+    def on_error(self, status_code):
+        # disconnect on rate limiting
+        if status_code == 420:
+            return False
+
+
+# input the twitter app tokens
+twitter_app_tokens = dict(consumer_key='',
+                          consumer_secret='',
+                          access_token='',
+                          access_token_secret='')
+
+# run the streaming
+api = set_twitter_app_tokens(twitter_app_tokens)
+listener = MyStreamListener(session)
+my_stream = Stream(auth=api.auth, listener=listener)
+track_words = ['']
+my_stream.filter(track=track_words, languages=["en"])
\ No newline at end of file
diff --git a/mlweb_backend/web_utils.py b/mlweb_backend/web_utils.py
new file mode 100644
index 0000000..9e56a4a
--- /dev/null
+++ b/mlweb_backend/web_utils.py
@@ -0,0 +1,29 @@
+import re
+import sqlite3
+
+from tweepy import OAuthHandler, API
+
+
+# authenticate with the twitter app tokens and return an API handle
+def set_twitter_app_tokens(tokens):
+    auth = OAuthHandler(tokens['consumer_key'], tokens['consumer_secret'])
+    auth.set_access_token(tokens['access_token'], tokens['access_token_secret'])
+    api = API(auth)
+
+    return api
+
+
+# strip mentions, hashtags, links and other non-alphanumeric characters
+def clean_tweet(text):
+    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())
+
+
+# connect to the database and return all rows for the query
+def get_db_data(db_path, query, params=()):
+    conn = sqlite3.connect(db_path)
+    c = conn.cursor()
+    c.execute(query, params)
+    tweets = c.fetchall()
+    conn.close()
+
+    return tweets
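
For context, the three scripts form a pipeline: streaming.py runs continuously and fills the raw tweet database, preprocessor.py cleans the collected tweets into preprocessed_tweets.db, and processor.py extracts topics from the most recent hour. A minimal sketch of one way to drive the two batch steps hourly (the scheduling here is an assumption, not part of the patch):

    import subprocess
    import time

    # hypothetical hourly driver: streaming.py is assumed to be running
    # separately and continuously filling the raw tweet database
    while True:
        subprocess.run(['python', 'preprocessor.py'], check=True)  # clean the newest tweets
        subprocess.run(['python', 'processor.py'], check=True)     # extract and pickle topics
        time.sleep(60 * 60)  # processor.py looks back exactly one hour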