Upload Yiran's code. by shao-wang-me · Pull Request #1 · shao-wang-me/ml-web · GitHub
This repository was archived by the owner on Sep 28, 2021. It is now read-only.

Upload Yiran's code. #1

Open · wants to merge 1 commit into base: develop
4 changes: 3 additions & 1 deletion README.md
@@ -1 +1,3 @@
# ml-web
# ml-web

Twitter analysis.
30 changes: 30 additions & 0 deletions mlweb_backend/preprocessor.py
@@ -0,0 +1,30 @@
import sqlite3

import pandas as pd
from sqlalchemy import create_engine

from web_utils import clean_tweet, get_db_data

# Get the tweets from the streaming database.
# The database path and table name are placeholders to be filled in.
tweets = get_db_data('?.db', 'SELECT tweet, created_at FROM ')

# Build the preprocessed pandas DataFrame of cleaned tweets and timestamps.
clean_tweets = []
tweets_datetime = []
for tweet in tweets:
    clean_tweets.append(clean_tweet(tweet[0]))
    tweets_datetime.append(tweet[1])

preprocessed_dict = {'tweet': clean_tweets, 'date': tweets_datetime}
preprocessed_df = pd.DataFrame(preprocessed_dict)
preprocessed_df.date = pd.to_datetime(preprocessed_df.date)

# Store the preprocessed tweets in the database, falling back to a
# SQLAlchemy engine if the plain sqlite3 connection fails.
try:
    engine = sqlite3.connect('preprocessed_tweets.db')
    preprocessed_df.to_sql('preprocessed_tweets', con=engine, if_exists='replace')
except Exception:
    engine = create_engine('sqlite:///preprocessed_tweets.db')
    preprocessed_df.to_sql('preprocessed_tweets', con=engine, if_exists='replace')
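
To sanity-check the output of preprocessor.py, here is a minimal sketch of reading the table back, assuming the preprocessed_tweets.db file and preprocessed_tweets table created above:

import sqlite3

import pandas as pd

# Read the table written by preprocessor.py back into a DataFrame
# (database and table names are assumed from the script above).
conn = sqlite3.connect('preprocessed_tweets.db')
df = pd.read_sql('SELECT tweet, date FROM preprocessed_tweets', conn, parse_dates=['date'])
conn.close()
print(df.head())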
68 changes: 68 additions & 0 deletions mlweb_backend/processor.py
@@ -0,0 +1,68 @@
import datetime
import pickle

import spacy
from gensim import corpora, models

from web_utils import get_db_data

# Get the current time (UTC) and the time one hour before it.
now = datetime.datetime.utcnow()
before = now - datetime.timedelta(hours=1)
now_str = f"'{now}'"
before_str = f"'{before}'"

# Select the tweets stored between those two times.
execution = ('SELECT tweet FROM preprocessed_tweets WHERE date BETWEEN '
             + before_str + ' AND ' + now_str)
tweets_clean = get_db_data('preprocessed_tweets.db', execution)


# Tokenise and lemmatise the tweets, then build the gensim dictionary and corpus.
def generate_nlp_object(tweets):
    # The 'en' shortcut works on spaCy 2.x; newer versions use 'en_core_web_sm'.
    nlp = spacy.load('en')

    # Extra stop words (placeholder list), marked once before the loop.
    my_stop_words = ['']
    for stopword in my_stop_words:
        nlp.vocab[stopword].is_stop = True
        nlp.vocab[stopword.lower()].is_stop = True

    texts = []
    for tweet in tweets:
        doc = nlp(tweet[0])
        # Keep lemmas of content words longer than three characters.
        text = [w.lemma_ for w in doc
                if not w.is_stop and not w.is_punct and not w.like_num
                and w.lemma_ != '-PRON-' and len(w) > 3]
        texts.append(text)

    # Merge frequent bigrams, then build the dictionary and bag-of-words corpus.
    bigram = models.Phrases(texts)
    texts = [bigram[line] for line in texts]
    dictionary = corpora.Dictionary(texts)
    # dictionary.filter_extremes(no_below=10, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in texts]

    return texts, dictionary, corpus


texts, dictionary, corpus = generate_nlp_object(tweets_clean)


# Use LDA to generate topics.
def generate_topics_lda(corpus, dictionary, num_topics=1):
    return models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)


model = generate_topics_lda(corpus, dictionary)
topics_list = model.show_topics()

# Store the generated topics in a pickle file named after the time window;
# strftime gives zero-padded, unambiguous file names.
now_name = now.strftime('%m%d%H%M%S')
before_name = before.strftime('%m%d%H%M%S')
pickle_name = now_name + '-' + before_name + '.pkl'
with open(pickle_name, 'wb') as pickle_out:
    pickle.dump(topics_list, pickle_out)
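
For inspecting the results later, a short sketch of loading one of the pickled topic lists; the file name below is only a hypothetical instance of the '<now>-<before>.pkl' pattern used above:

import pickle

# Load a topics pickle written by processor.py (hypothetical file name).
with open('0703141005-0703131005.pkl', 'rb') as f:
    topics_list = pickle.load(f)

# show_topics() returns (topic_id, topic_string) pairs.
for topic_id, topic in topics_list:
    print(topic_id, topic)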
73 changes: 73 additions & 0 deletions mlweb_backend/streaming.py
@@ -0,0 +1,73 @@
import logging

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from tweepy import Stream, StreamListener

from web_utils import set_twitter_app_tokens

# Set up the log file.
logging.basicConfig(filename="streaming.log",
                    level=logging.DEBUG,
                    format="%(asctime)s:%(levelname)s:%(message)s")


Base = declarative_base()
table_name = ''  # placeholder: name of the tweet table
db_name = ''  # placeholder: name of the SQLite database file


# ORM model for one streamed tweet.
class TweetDb(Base):
    __tablename__ = table_name

    index = Column(Integer, primary_key=True)
    tweet = Column(String(500))
    created_at = Column(String(300))
    user_name = Column(String(300))


# Create the database session.
engine = create_engine('sqlite:///' + db_name + '.db')
Base.metadata.create_all(engine)
db_session = sessionmaker(bind=engine)
session = db_session()


# Stream listener that writes each incoming tweet to the database.
class MyStreamListener(StreamListener):
    def __init__(self, sess):
        super().__init__()
        self.session = sess

    def on_status(self, status):
        # Skip retweets: only they carry a retweeted_status attribute.
        if hasattr(status, 'retweeted_status'):
            return

        new_data = TweetDb(tweet=status.text,
                           created_at=str(status.created_at),
                           user_name=status.user.screen_name)

        self.session.add(new_data)
        self.session.commit()

    def on_error(self, status_code):
        # 420 means the stream is being rate limited: disconnect.
        if status_code == 420:
            return False


# Twitter app tokens (placeholders: fill in real credentials).
twitter_app_tokens = dict(consumer_key='',
                          consumer_secret='',
                          access_token='',
                          access_token_secret='')

# Run the stream, tracking the given keywords in English tweets.
api = set_twitter_app_tokens(twitter_app_tokens)
listener = MyStreamListener(session)
my_stream = Stream(auth=api.auth, listener=listener)
track_words = ['']  # placeholder: keywords to track
my_stream.filter(track=track_words, languages=["en"])
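
One caveat for future maintenance: Tweepy 4.x removed StreamListener and moved the callbacks onto Stream itself, so this file is tied to Tweepy 3.x. A rough sketch of the equivalent setup under Tweepy 4 (all credentials and keywords are placeholders):

from tweepy import Stream


# Tweepy 4.x style: subclass Stream and pass the four credentials directly.
class MyStream(Stream):
    def on_status(self, status):
        if hasattr(status, 'retweeted_status'):
            return  # skip retweets
        print(status.text)  # persist to the database here, as above


stream = MyStream('consumer_key', 'consumer_secret',
                  'access_token', 'access_token_secret')
stream.filter(track=['keyword'], languages=['en'])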
30 changes: 30 additions & 0 deletions mlweb_backend/web_utils.py
@@ -0,0 +1,30 @@
import re
import sqlite3

from tweepy import OAuthHandler, API


# Authenticate with the Twitter API using the given app tokens.
def set_twitter_app_tokens(tokens):
    auth = OAuthHandler(tokens['consumer_key'], tokens['consumer_secret'])
    auth.set_access_token(tokens['access_token'], tokens['access_token_secret'])
    api = API(auth)

    return api


# Strip @mentions, hashtags, URLs, and other non-alphanumeric characters
# from a tweet, then collapse the remaining whitespace.
def clean_tweet(text):
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|#[A-Za-z0-9]+|([^0-9A-Za-z \t])|(\w+://\S+)", " ", text).split())


# Connect to the database, run the given query, and return all rows.
def get_db_data(db_path, execute):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute(execute)
    tweets = c.fetchall()
    conn.close()

    return tweets
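
A quick usage example showing what clean_tweet does to a raw tweet (the input string is made up):

from web_utils import clean_tweet

# Mentions, hashtags, URLs, and punctuation are stripped; the remaining
# words are joined with single spaces.
print(clean_tweet('Check this out @user #topic https://t.co/abc !!!'))
# -> 'Check this out'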
