In the age of social media, Elon Musk has had a significant impact on our society. Unlike most CEOs, he takes a more millennial-style approach and maintains a large social media presence: his Twitter account, @elonmusk, has over 120 million followers. Do Elon Musk’s tweets mainly have positive, negative, or neutral connotations? We will take a closer look at a dataset we found on Kaggle, along with data we scraped ourselves using Twitter's API, to examine the sentiment of Elon Musk’s tweets.
There are two specific timeframes we would like to investigate thoroughly. On April 4th, 2022, Elon Musk disclosed his stake in Twitter, and Twitter announced that he would be joining its board of directors. Did his tweets carry positive sentiment during this time? The second timeframe is around October 28th, 2022, when Elon Musk announced that he had finalized a deal to acquire Twitter and began internal changes that led to strife within the company due to layoffs and other reasons. Our motivation for this project is to see whether Elon Musk's behavior changed when he joined Twitter's board of directors and when he announced that he was acquiring Twitter.
This tutorial will guide you through the analysis of Elon Musk’s Twitter data and we will investigate his behavior through his tweets.
# imports
import os
from dotenv import load_dotenv
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date as dt, timedelta
import string
import nltk
nltk.download([
"names",
"stopwords",
"state_union",
"twitter_samples",
"movie_reviews",
"averaged_perceptron_tagger",
"vader_lexicon",
"punkt",
], quiet=True)
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
import scipy.stats as stats
For data collection, we combined two methods. First, we found a dataset on Kaggle containing Elon Musk’s tweets from January 27th, 2022 to October 27th, 2022. Additionally, we scraped data using Twitter’s API to get tweets from October 20th, 2022 to December 12th, 2022. We noticed that the tweets from October 20th to October 27th in the Kaggle dataset had fewer likes and retweets than they do now, mainly because of when the data was pulled: the counts had not yet grown to their current values. Since the freshly scraped counts are more accurate, we removed that overlap from the Kaggle data and replaced it with our scraped data. Now, we have a dataset containing tweets from the Twitter API from October 20th, 2022 to December 12th, 2022 and tweets from Kaggle spanning January 27th, 2022 to October 19th, 2022.
The data collected from data scraping contained information regarding the unfiltered tweet, the date and time the tweet was created, the number of likes, and the number of retweets. The data collected from the Kaggle dataset included the unfiltered tweets, the number of retweets, the number of likes, the date and time of the tweet, and the cleaned tweet.
kaggle_df = pd.read_csv("cleandata.csv")
kaggle_df
Tweets | Retweets | Likes | Date | Cleaned_Tweets | |
---|---|---|---|---|---|
0 | @PeterSchiff 🤣 thanks | 209 | 7021 | 2022-10-27 16:17:39 | thanks |
1 | @ZubyMusic Absolutely | 755 | 26737 | 2022-10-27 13:19:25 | Absolutely |
2 | Dear Twitter Advertisers https://t.co/GMwHmInPAS | 55927 | 356623 | 2022-10-27 13:08:00 | Dear Twitter Advertisers |
3 | Meeting a lot of cool people at Twitter today! | 9366 | 195546 | 2022-10-26 21:39:32 | Meeting a lot of cool people at Twitter today! |
4 | Entering Twitter HQ – let that sink in! https:... | 145520 | 1043592 | 2022-10-26 18:45:58 | Entering Twitter HQ – let that sink in! |
... | ... | ... | ... | ... | ... |
2663 | @LimitingThe @baglino Just that manganese is a... | 171 | 3173 | 2022-01-27 22:01:06 | Just that manganese is an alternative to iron ... |
2664 | @incentives101 @ICRicardoLara Exactly | 145 | 4234 | 2022-01-27 21:23:20 | Exactly |
2665 | @ICRicardoLara Your policies are directly resp... | 421 | 6144 | 2022-01-27 21:13:57 | Your policies are directly responsible for the... |
2666 | @ICRicardoLara You should be voted out of office | 484 | 7029 | 2022-01-27 21:12:27 | You should be voted out of office |
2667 | CB radios are free from govt/media control | 11302 | 113429 | 2022-01-27 21:00:09 | CB radios are free from govt/media control |
2668 rows × 5 columns
This dataset contains 5 columns: the raw text (Tweets), Retweets, Likes, Date, and Cleaned_Tweets. The date entries contain the time of day, but we only care about the date, so let's strip the time from these entries.
# convert date and time to just date
kaggle_df["Date"] = pd.to_datetime(kaggle_df["Date"]).dt.date
kaggle_df.head()
Tweets | Retweets | Likes | Date | Cleaned_Tweets | |
---|---|---|---|---|---|
0 | @PeterSchiff 🤣 thanks | 209 | 7021 | 2022-10-27 | thanks |
1 | @ZubyMusic Absolutely | 755 | 26737 | 2022-10-27 | Absolutely |
2 | Dear Twitter Advertisers https://t.co/GMwHmInPAS | 55927 | 356623 | 2022-10-27 | Dear Twitter Advertisers |
3 | Meeting a lot of cool people at Twitter today! | 9366 | 195546 | 2022-10-26 | Meeting a lot of cool people at Twitter today! |
4 | Entering Twitter HQ – let that sink in! https:... | 145520 | 1043592 | 2022-10-26 | Entering Twitter HQ – let that sink in! |
print(kaggle_df["Date"].min(), kaggle_df["Date"].max())
2022-01-27 2022-10-27
This dataset only contains tweets from 1/27 through 10/27. Let's use the Twitter API to pull tweets from 10/20 through 12/12 to add more data to analyze.
The Kaggle dataset was compiled on 10/27, meaning the like and retweet counts are likely lower than what they would be today. We are assuming that most tweets approach a limit of likes and retweets after about a week of being posted. We are going to gather tweets from 10/20 onward and replace entries in the Kaggle dataset that overlap.
To use the Twitter API, we needed to create a developer app on Twitter's website. From there we can generate an API key (Bearer Token), which allows us to access the API's endpoints. To safely share this code, we used a .env file to store the Bearer Token, loading it with the load_dotenv
function from the python-dotenv
package.
Next we need to use the /2/users/{id}/tweets endpoint, passing in our desired start time and end time to gather the correct tweets. However, this endpoint can only return 100 tweets at a time. To get around this we use a technique called pagination: each response includes a token identifying the next page of results. We keep sending requests with the latest token until the API stops returning one, signifying the end of the whole search.
To accomplish this, we created a loop that sends a new request until there are none left to make. Each request generates a dataframe from the results returned (in JSON). We then concatenate each request's dataframe onto the previous ones, yielding a complete dataframe at the end.
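Stripped of the Twitter-specific details, the loop below sketches the same pagination pattern in isolation, with a hypothetical fetch_page stub standing in for the real HTTP request (the function and page names here are ours, purely illustrative):

```python
# Minimal pagination sketch. fetch_page is a hypothetical stand-in for the
# real API call; it returns (items, next_token), where next_token is None
# on the last page -- mirroring Twitter's meta.next_token behavior.
PAGES = {
    None: (["tweet1", "tweet2"], "page2"),   # first request carries no token
    "page2": (["tweet3"], None),             # final page: no next_token
}

def fetch_page(token):
    return PAGES[token]

def fetch_all():
    items, token = [], None
    while True:
        page_items, token = fetch_page(token)
        items.extend(page_items)
        if token is None:        # no next page -> the search is complete
            return items

print(fetch_all())
```

The real loop below does the same thing, with the token passed as the `pagination_token` query parameter.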
load_dotenv(".env")
token = os.environ.get("BEARER_TOKEN")
headers = {"Authorization": "Bearer {}".format(token)}
ELON_ID = "44196397"
search_url = "https://api.twitter.com/2/users/{}/tweets".format(ELON_ID)
query_params = {
'start_time': '2022-10-20T00:00:00Z', # Look at tweets after 10/20/22
'end_time': '2022-12-12T23:59:59Z', # Look at tweets before 12/13/22
'tweet.fields': 'text,created_at,public_metrics', # Retrieve text of tweet, date posted, metrics (likes, retweets)
'max_results': 100, # get 100 tweets every request (this is the max Twitter allows)
}
api_df = pd.DataFrame()
# because we cannot download all tweets at once, we will continue making requests using a technique called pagination
next_token = ""
while next_token is not None:
curr_params = query_params.copy()
if next_token != "":
curr_params['pagination_token'] = next_token
# make request to twitter api
res = requests.request("GET", search_url, headers = headers, params = curr_params)
res_json = res.json()
# end of loop check - more on this below
if 'data' not in res_json:
next_token = None
continue
# remove fields Twitter gives back to us that we don't need and de-construct metrics dict
tweets = res_json['data']
for t in tweets:
if 'edit_history_tweet_ids' in t:
del t['edit_history_tweet_ids']
if 'id' in t:
del t['id']
if 'public_metrics' in t:
metrics = t['public_metrics']
t['likes'] = metrics['like_count']
t['retweets'] = metrics['retweet_count']
del t['public_metrics']
# create dataframe out of current 100 tweets then add to cumulative df
curr_df = pd.DataFrame().from_dict(tweets)
api_df = pd.concat([api_df, curr_df])
# check if the next page exists, if not we end the loop by setting next_token to None
if 'meta' in res_json:
if 'next_token' in res_json['meta']:
next_token = res_json['meta']['next_token']
else:
next_token = None
else:
next_token = None
print(len(api_df), "Tweets")
api_df.head()
1501 Tweets
text | created_at | likes | retweets | |
---|---|---|---|---|
0 | @Lukewearechange @AndrewPollackFL Accurate | 2022-12-12T22:15:25.000Z | 39125 | 2888 |
1 | @BillyM2k Haha totally. High quality bots are ... | 2022-12-12T21:40:20.000Z | 11895 | 598 |
2 | @rupasubramanya @TheFP Exactly | 2022-12-12T21:34:51.000Z | 35009 | 2402 |
3 | @TRHLofficial @ggreenwald Indeed | 2022-12-12T21:14:47.000Z | 11755 | 661 |
4 | @micsolana The wording is mine lol | 2022-12-12T20:56:45.000Z | 78760 | 2608 |
# confirm tweets from the proper date range were retrieved
print("Min:", api_df["created_at"].min(), " Max:", api_df["created_at"].max())
Min: 2022-10-20T00:10:14.000Z Max: 2022-12-12T22:15:25.000Z
The Twitter API also returns retweets made by the given user; these can be identified by text starting with "RT @" and having 0 likes. Let's see how many there are.
api_df[(api_df["text"].apply(lambda x: str(x).startswith("RT @"))) & (api_df["likes"] == 0)]
text | created_at | likes | retweets | |
---|---|---|---|---|
57 | RT @SpaceX: Deployment of ispace’s HAKUTO-R Mi... | 2022-12-11T08:27:27.000Z | 0 | 3989 |
63 | RT @SpaceX: Watch Falcon 9 launch ispace’s HAK... | 2022-12-11T07:56:19.000Z | 0 | 3897 |
64 | RT @SpaceX: Falcon 9’s first stage has landed ... | 2022-12-11T07:56:16.000Z | 0 | 3528 |
65 | RT @SpaceX: Liftoff! https://t.co/FEenmAJmOz | 2022-12-11T07:56:14.000Z | 0 | 5340 |
69 | RT @CommunityNotes: Beginning today, Community... | 2022-12-11T01:45:21.000Z | 0 | 4556 |
... | ... | ... | ... | ... |
59 | RT @Tesla: Vote for new Supercharger locations... | 2022-10-21T21:46:05.000Z | 0 | 2315 |
60 | RT @Tesla: Our most advanced paint system yet,... | 2022-10-21T21:45:15.000Z | 0 | 1532 |
70 | RT @Tesla: https://t.co/CqbkkORG70 | 2022-10-21T06:01:32.000Z | 0 | 2655 |
94 | RT @SpaceX: Deployment of 54 Starlink satellit... | 2022-10-20T16:05:22.000Z | 0 | 2138 |
2 | RT @Tesla: 10 years of Supercharging.\n\n46 co... | 2022-10-20T01:12:50.000Z | 0 | 4408 |
72 rows × 4 columns
Let's remove those.
api_df = api_df[~((api_df["text"].apply(lambda x: str(x).startswith("RT @"))) & (api_df["likes"] == 0))]
api_df
text | created_at | likes | retweets | |
---|---|---|---|---|
0 | @Lukewearechange @AndrewPollackFL Accurate | 2022-12-12T22:15:25.000Z | 39125 | 2888 |
1 | @BillyM2k Haha totally. High quality bots are ... | 2022-12-12T21:40:20.000Z | 11895 | 598 |
2 | @rupasubramanya @TheFP Exactly | 2022-12-12T21:34:51.000Z | 35009 | 2402 |
3 | @TRHLofficial @ggreenwald Indeed | 2022-12-12T21:14:47.000Z | 11755 | 661 |
4 | @micsolana The wording is mine lol | 2022-12-12T20:56:45.000Z | 78760 | 2608 |
... | ... | ... | ... | ... |
98 | @marenkahnert @jasondebolt Exactly | 2022-10-20T07:57:25.000Z | 1932 | 79 |
99 | @Teslarati @13ericralph31 SpaceX has more acti... | 2022-10-20T07:30:52.000Z | 16128 | 1385 |
0 | @jasondebolt The media reports with great fanf... | 2022-10-20T06:52:01.000Z | 23000 | 1489 |
1 | @jakebrowatzke @andyjayhawk 🤣 | 2022-10-20T06:38:51.000Z | 1949 | 78 |
3 | @Teslarati @JohnnaCrider1 Accelerating sustain... | 2022-10-20T00:10:14.000Z | 18072 | 1408 |
1429 rows × 4 columns
The created_at
column contains the date and time a tweet was created. We only care about the date, so let's strip the timestamp down to a date, store it in a new date
column, and finally drop the created_at
column.
api_df["date"] = pd.to_datetime(api_df["created_at"]).dt.date
api_df = api_df.drop(columns="created_at")
api_df.head()
text | likes | retweets | date | |
---|---|---|---|---|
0 | @Lukewearechange @AndrewPollackFL Accurate | 39125 | 2888 | 2022-12-12 |
1 | @BillyM2k Haha totally. High quality bots are ... | 11895 | 598 | 2022-12-12 |
2 | @rupasubramanya @TheFP Exactly | 35009 | 2402 | 2022-12-12 |
3 | @TRHLofficial @ggreenwald Indeed | 11755 | 661 | 2022-12-12 |
4 | @micsolana The wording is mine lol | 78760 | 2608 | 2022-12-12 |
Now let's combine the two datasets.
Our first step is to remove entries after 10/19 from the Kaggle dataset; these will be replaced by the Twitter API dataset.
kaggle_df = kaggle_df[~(kaggle_df["Date"] > dt(2022,10,19))]
kaggle_df
Tweets | Retweets | Likes | Date | Cleaned_Tweets | |
---|---|---|---|---|---|
95 | @westcoastbill Will require truly exceptional ... | 745 | 11060 | 2022-10-19 | Will require truly exceptional execution, but ... |
96 | I will not let you down, no matter what it takes | 35111 | 392237 | 2022-10-19 | I will not let you down, no matter what it takes |
97 | @DirtyTesLa Awesome | 88 | 2381 | 2022-10-19 | Awesome |
98 | We even did a Starlink video call on one airpl... | 2060 | 37029 | 2022-10-19 | We even did a Starlink video call on one airpl... |
99 | Vox Populi Vox Dei | 5709 | 53880 | 2022-10-19 | Vox Populi Vox Dei |
... | ... | ... | ... | ... | ... |
2663 | @LimitingThe @baglino Just that manganese is a... | 171 | 3173 | 2022-01-27 | Just that manganese is an alternative to iron ... |
2664 | @incentives101 @ICRicardoLara Exactly | 145 | 4234 | 2022-01-27 | Exactly |
2665 | @ICRicardoLara Your policies are directly resp... | 421 | 6144 | 2022-01-27 | Your policies are directly responsible for the... |
2666 | @ICRicardoLara You should be voted out of office | 484 | 7029 | 2022-01-27 | You should be voted out of office |
2667 | CB radios are free from govt/media control | 11302 | 113429 | 2022-01-27 | CB radios are free from govt/media control |
2573 rows × 5 columns
Then we need to rename and reorder the columns so that they match the Twitter API dataset, as well as drop the Cleaned_Tweets
column from the Kaggle dataset (we will implement our own cleaning later).
tmp_kaggle_df = kaggle_df.drop(columns="Cleaned_Tweets")
tmp_kaggle_df.columns = ["text", "retweets", "likes", "date"]
tmp_kaggle_df = tmp_kaggle_df[["text", "date", "likes", "retweets"]]
df = pd.concat([api_df,tmp_kaggle_df])
df
text | likes | retweets | date | |
---|---|---|---|---|
0 | @Lukewearechange @AndrewPollackFL Accurate | 39125 | 2888 | 2022-12-12 |
1 | @BillyM2k Haha totally. High quality bots are ... | 11895 | 598 | 2022-12-12 |
2 | @rupasubramanya @TheFP Exactly | 35009 | 2402 | 2022-12-12 |
3 | @TRHLofficial @ggreenwald Indeed | 11755 | 661 | 2022-12-12 |
4 | @micsolana The wording is mine lol | 78760 | 2608 | 2022-12-12 |
... | ... | ... | ... | ... |
2663 | @LimitingThe @baglino Just that manganese is a... | 3173 | 171 | 2022-01-27 |
2664 | @incentives101 @ICRicardoLara Exactly | 4234 | 145 | 2022-01-27 |
2665 | @ICRicardoLara Your policies are directly resp... | 6144 | 421 | 2022-01-27 |
2666 | @ICRicardoLara You should be voted out of office | 7029 | 484 | 2022-01-27 |
2667 | CB radios are free from govt/media control | 113429 | 11302 | 2022-01-27 |
4002 rows × 4 columns
# confirm date range is 1/27 - 12/12
print("Min:", df["date"].min(), " Max:", df["date"].max())
Min: 2022-01-27 Max: 2022-12-12
We now have a combined dataset that we can start to use in our analysis!
For the data obtained from both methods, we only keep the date of the tweet for future analysis, removing the time. The numbers of likes and retweets remain untouched. We dropped the cleaned tweets column from the Kaggle dataset since we will run our own cleaner on the original tweets. Also, we noticed that ampersands appeared as the HTML entity "&amp;" in the original tweets, so we replaced that with just the ampersand symbol, "&". The tweets obtained from Twitter’s API include retweets, which we don’t need, so we removed the 72 instances of retweets.
Now, we can combine the two datasets into one dataframe and look at the contents of the tweets. Each tweet may include mentions of other users, indicated by @ followed by a username (e.g. “@cmsc320”, “@maxiscool”), as well as links to other tweets, websites, and media like images or videos. We need to remove these from the text field.
df["cleaned_text"] = df["text"].apply(lambda x: re.sub(r'@\w+', "", x))
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: re.sub(r'https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)', "", x))
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: x.replace("&", "&"))
df.head()
text | likes | retweets | date | cleaned_text | |
---|---|---|---|---|---|
0 | @Lukewearechange @AndrewPollackFL Accurate | 39125 | 2888 | 2022-12-12 | Accurate |
1 | @BillyM2k Haha totally. High quality bots are ... | 11895 | 598 | 2022-12-12 | Haha totally. High quality bots are fine! |
2 | @rupasubramanya @TheFP Exactly | 35009 | 2402 | 2022-12-12 | Exactly |
3 | @TRHLofficial @ggreenwald Indeed | 11755 | 661 | 2022-12-12 | Indeed |
4 | @micsolana The wording is mine lol | 78760 | 2608 | 2022-12-12 | The wording is mine lol |
Now that we have scraped, parsed, and cleaned the data, we have a dataframe of Elon Musk's tweets to work with. We can begin exploring our dataframe using matplotlib to visualize our data.
Did Elon tweet every day from 1/27 through 12/12? To find out, we take the set difference between every day in that range and the unique values of the date column. This will also be helpful later when we plot data.
min_date = df["date"].min()
max_date = df["date"].max()
time_delta = max_date - min_date
all_dates = [min_date + timedelta(x) for x in range(time_delta.days + 1)]
diff_in_dates = set(all_dates) - set(df["date"].unique().tolist())
print("There are " + str(len(diff_in_dates)) + " days when Elon Musk did not post a tweet.")
diff_in_dates
There are 23 days when Elon Musk did not post a tweet.
{datetime.date(2022, 2, 24), datetime.date(2022, 2, 27), datetime.date(2022, 3, 23), datetime.date(2022, 4, 11), datetime.date(2022, 4, 12), datetime.date(2022, 4, 13), datetime.date(2022, 5, 5), datetime.date(2022, 6, 22), datetime.date(2022, 6, 23), datetime.date(2022, 6, 24), datetime.date(2022, 6, 25), datetime.date(2022, 6, 26), datetime.date(2022, 6, 27), datetime.date(2022, 6, 28), datetime.date(2022, 6, 29), datetime.date(2022, 6, 30), datetime.date(2022, 7, 1), datetime.date(2022, 7, 3), datetime.date(2022, 7, 9), datetime.date(2022, 7, 10), datetime.date(2022, 7, 17), datetime.date(2022, 8, 3), datetime.date(2022, 9, 2)}
Looks like Elon did not tweet on 23 days, including a 9-day break from June 22nd to June 30th, 2022.
dates_count_group = df.groupby(by="date", as_index=False).count()
plt.figure(figsize=(20,5))
plt.scatter(dates_count_group["date"], dates_count_group["text"])
plt.title("Amount of Elon Musk Tweets per Day in 2022")
plt.xlabel("Date")
plt.ylabel("Amount of Tweets")
plt.show()
display(dates_count_group["text"].describe())
count 297.000000 mean 13.474747 std 10.909031 min 1.000000 25% 5.000000 50% 11.000000 75% 19.000000 max 69.000000 Name: text, dtype: float64
In this scatterplot, we examine the number of times Elon Musk tweeted per day in 2022. On average, he tweeted about 13.47 times a day, typically between 5 and 19 times. In November and December, there is an uptick in how often he tweets. His most active days were November 23rd and December 9th, when he tweeted 69 and 64 times, respectively.
dates_sum_group = df.groupby(by="date", as_index=False).sum()
plt.figure(figsize=(20,5))
plt.scatter(dates_sum_group["date"], dates_sum_group["likes"])
plt.title("Amount of Likes on Elon Musk's Tweets per Day in 2022")
plt.xlabel("Date")
plt.ylabel("Amount of Likes (per Ten Million)")
plt.show()
dates_sum_group["likes"].describe()
count 2.970000e+02 mean 1.139883e+06 std 1.658838e+06 min 3.664000e+03 25% 2.044820e+05 50% 5.500210e+05 75% 1.274600e+06 max 1.349880e+07 Name: likes, dtype: float64
In this scatterplot, we visualize the number of likes Elon Musk's tweets receive per day. On average, he gets about 1,139,883 likes per day, with a median of about 550,021. The like counts on April 25th and 28th, along with a majority of days in November and December, skew the mean upward since they are significantly higher than normal. On April 25th, he tweeted 8 times and got 6,942,305 likes. On April 28th, he tweeted 16 times and got 13,498,798 likes, his all-time high for 2022. On his highly-liked days in November and December, he may simply have received more likes because he tweeted more often than usual, as illustrated in the previous scatterplot.
df_likes_to_tweets = dates_sum_group.copy()
df_likes_to_tweets["text"] = dates_count_group["text"]
df_likes_to_tweets["likes/tweets"] = df_likes_to_tweets.apply(lambda x: x["likes"]/x["text"], axis=1)
display(df_likes_to_tweets[df_likes_to_tweets["likes/tweets"] > 500000])
display(df_likes_to_tweets["likes/tweets"].describe())
plt.figure(figsize=(20,5))
plt.scatter(df_likes_to_tweets["date"], df_likes_to_tweets["likes/tweets"])
plt.title("Amount of Likes per Tweet for Elon Musk's Tweets per Day in 2022")
plt.xlabel("Date")
plt.ylabel("Amount of Likes per Tweet")
plt.show()
date | likes | retweets | text | likes/tweets | |
---|---|---|---|---|---|
82 | 2022-04-25 | 6942305 | 789375 | 8 | 867788.125000 |
83 | 2022-04-26 | 3910184 | 401710 | 6 | 651697.333333 |
85 | 2022-04-28 | 13498798 | 1307664 | 16 | 843674.875000 |
251 | 2022-10-28 | 8101499 | 882314 | 15 | 540099.933333 |
count 297.000000 mean 79500.417900 std 98491.561431 min 3664.000000 25% 26224.045455 50% 53479.363636 75% 98968.285714 max 867788.125000 Name: likes/tweets, dtype: float64
In this scatterplot, we look at the ratio of likes per tweet for each day in 2022. This visualization gives a better idea of how many likes he gets per tweet than the previous one because it removes the variance in how often he tweeted on a given day. On four specific days (April 25th, 26th, and 28th, and October 28th), his likes-to-tweets ratio was unusually high, exceeding 500,000 likes per tweet. This raises a question: did something significant happen in those time periods? On October 28th, Elon Musk announced that he was finalizing a deal to acquire Twitter, which might have influenced engagement with his account. Overall, he averaged about 79,500 likes per tweet in 2022.
We want to discover Elon's most tweeted words. To do this, we need to filter out common words such as "as", "the", "or", etc., and remove punctuation. We will store the words to filter in a set of stopwords, combining NLTK's English stopword list with the punctuation provided by the string module.
Then we will use WordCloud to create a word cloud image of every word in Elon's tweets that is not a stopword, where the size of each word represents how often it appears.
# create stopword list & wordcloud:
stops = set(stopwords.words('english') + list(string.punctuation))
# extra punctuation added to stopwords (weird ASCII chars, etc.)
stops.add("amp")
stops.add("…")
stops.add("’")
stops.add("“")
stops.add("”")
# create a single string of all of the tweets
all_text = " ".join(tweet.lower() for tweet in df["cleaned_text"])
wordcloud = WordCloud(stopwords=stops).generate(all_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
From the word cloud we can see that "Twitter" was Elon's most tweeted word, followed by "Tesla", "people", and "would". Some other notable words are "ye" (Kanye) and "starlink" (Elon's product providing satellite internet across the globe).
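A word cloud sizes words by frequency, so the same ranking can be checked numerically with a plain frequency count. Here is a minimal sketch on toy tweets (the sample tweets and the small stopword set are ours, not the dataset's):

```python
from collections import Counter
import string

# Toy stand-ins for the cleaned tweets and the stopword set used above.
tweets = [
    "Twitter is the town square",
    "Tesla AI day is coming",
    "Twitter would be fun",
]
stops = {"is", "the", "be", "a"} | set(string.punctuation)

# Tokenize naively on whitespace, lowercase, and drop stopwords --
# the same filtering idea the word cloud applies under the hood.
counts = Counter(
    word for tweet in tweets
    for word in tweet.lower().split()
    if word not in stops
)
print(counts.most_common(3))
```

On the real dataframe, the equivalent check would run the same counter over `df["cleaned_text"]` with the full `stops` set.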
all_tweets = df["cleaned_text"].str.cat(sep=" ").strip()
words = [word.lower() for word in nltk.word_tokenize(all_tweets) if word not in stops]
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
finder.ngram_fd.most_common(10)
[(('woke', 'mind', 'virus'), 6), (('make', 'life', 'multiplanetary'), 6), (('tesla', 'ai', 'day'), 6), (('the', 'new', 'york'), 5), (('new', 'york', 'times'), 5), (('result', 'account', 'suspension'), 5), (('needed', 'make', 'life'), 4), (('incitement', 'violence', 'result'), 4), (('the', 'twitter', 'files'), 4), (('hate', 'speech', 'impressions'), 4)]
This chunk of code finds the most common three-word phrases (trigrams) across all of Elon's tweets. We notice some interesting phrases, like "woke mind virus" and "hate speech impressions", appearing four or more times throughout his tweets.
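Under the hood, the finder's frequency distribution is essentially a count over consecutive word triples, which can be sketched in plain Python (the toy token list is ours, purely illustrative):

```python
from collections import Counter

# Toy token stream standing in for the tokenized tweets.
words = ["make", "life", "multiplanetary", "we", "must",
         "make", "life", "multiplanetary"]

# Slide a window of three across the sequence by zipping offset views;
# this mirrors what TrigramCollocationFinder's ngram_fd counts.
trigrams = Counter(zip(words, words[1:], words[2:]))
print(trigrams.most_common(2))
```

The NLTK finder adds association measures (e.g. PMI) on top of these raw counts, but `ngram_fd.most_common` is just this frequency ranking.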
Next, we want to run sentiment analysis on Elon's Tweets. We are going to determine how many of Elon's tweets were either negative, neutral, or positive.
To do so, we are using SentimentIntensityAnalyzer
from the NLTK module. This sentiment analyzer uses VADER under the hood, which was built for short social media posts, like tweets!
We can get a polarity score for a given string using the polarity_scores
method, which returns a compound score between -1 and 1. We defined our bounds for a negative tweet as [-1, -0.05), a neutral tweet as [-0.05, 0.05], and a positive tweet as (0.05, 1].
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
scores = df["cleaned_text"].apply(lambda x: sia.polarity_scores(x)["compound"])
df["neg"] = scores.apply(lambda x: 1 if x < -0.05 else 0)
df["neu"] = scores.apply(lambda x: 1 if x >= -0.05 and x <= 0.05 else 0)
df["pos"] = scores.apply(lambda x: 1 if x > 0.05 else 0)
values = [df["neg"].sum(), df["neu"].sum(), df["pos"].sum()]
plt.bar(x=["Negative", "Neutral", "Positive"], height=values)
plt.title("Sentiment of Elon Musk's Tweets in 2022")
plt.xlabel("Sentiment")
plt.ylabel("Number of Tweets")
plt.show()
This bar chart shows the overall sentiment of Elon Musk's tweets in 2022, split into negative, neutral, and positive. From the graph, we can see that the majority of his tweets were neutral or positive, with a sprinkling of negatively connoted tweets. We will break this down further to see whether the balance varies by month and whether there is a general trend in the sentiment of his tweets.
labels = ['January','February','March','April','May','June','July','August','September','October','November','December']
pos = [0]*12
neg = [0]*12
neutral = [0]*12
for index, row in df.iterrows():
date = int(row['date'].month)
if row['pos'] == 1:
pos[date - 1] = pos[date - 1] + 1
elif row['neg'] == 1:
neg[date - 1] = neg[date - 1] + 1
elif row['neu'] == 1:
neutral[date - 1] = neutral[date - 1] + 1
x = np.arange(len(labels))
width = 0.3
fig, ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(15)
rects1 = ax.bar(x - width, neg, width, label='Negative', color='red')
rects2 = ax.bar(x, neutral, width, label='Neutral', color='blue')
rects3 = ax.bar(x + width, pos, width, label='Positive', color='green')
# text for labels, title and custom x,y-axis tick labels
ax.set_ylabel('Tweets')
ax.set_title('Sentiment Analysis')
ax.set_xticks(x, labels)
ax.legend()
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)
ax.bar_label(rects3, padding=3)
plt.show()
In this grouped bar chart, we break Elon Musk's tweets into groups by month and sentiment. For each month, we split the tweets into positive, negative, and neutral to look for a general trend. January's counts are low because the data starts on January 27th, so it covers only the last few days of the month. Generally, the neutral and positive sentiment tweets dominate the negative sentiment tweets.
labels = ['January','February','March','April','May','June','July','August','September','October','November','December']
total = [0]*12
for index in range(0,len(pos)):
total[index] = pos[index] + neg[index] + neutral[index]
pos[index] = float(pos[index])/total[index]
neg[index] = float(neg[index])/total[index]
neutral[index] = float(neutral[index])/total[index]
x = np.arange(len(labels))
width = 0.3
fig, ax = plt.subplots()
fig.set_figheight(10)
fig.set_figwidth(15)
rand = [0]*12
for index in range(0,len(pos)):
rand[index] = neg[index]+neutral[index]
plt.bar(x, neg, color='r')
plt.bar(x, neutral, bottom=neg, color='b')
plt.bar(x, pos, bottom=rand,color='g')
# text for labels, title and custom x,y-axis tick labels
plt.ylabel('Percentage of Tweets')
plt.title('Sentiment Analysis')
plt.xticks(x, labels)
plt.legend(['Negative','Neutral','Positive'])
plt.show()
This stacked bar chart is another way of representing the grouped bar chart above. Instead of counting tweets, it shows the percentage of each sentiment within a month. Negative sentiment tweets never exceed 21% of a month, neutral sentiment tweets range between 34% and 49%, and positive sentiment tweets range from 34% to 46%.
Now that we know what our data looks like, we can begin formulating a hypothesis about it. We've seen surges in his tweet interactions (likes and retweets) during several timeframes. From outside information, we know Elon Musk joined Twitter's board of directors on April 4th, 2022 and announced he was acquiring Twitter on October 28th, 2022. So, let's create a hypothesis revolving around these dates.
The timeframes we will use for this test are 3 days before and 7 days after the date in question, so the ranges are April 1st to April 11th and October 25th to November 4th. For the normal timeframe, we will pick an 11-day range between the two important dates: July 7th to July 17th.
Hypothesis: there is a statistically significant difference in the sentiment of Elon Musk's tweets between a normal 11-day span and the timeframes where he made a major business announcement.
We can begin examining the data, then run a hypothesis test.
df_test1 = df[(df["date"] >= dt(2022, 4, 1)) & (df["date"] <= dt(2022, 4, 11))]
df_test2 = df[(df["date"] >= dt(2022, 10, 25)) & (df["date"] <= dt(2022, 11, 4))]
df_test3 = df[(df["date"] >= dt(2022, 7, 7)) & (df["date"] <= dt(2022, 7, 17))]
neg = list()
pos = list()
neu = list()
for d in [df_test1,df_test2,df_test3]:
neg_val = d.groupby(by=['neg']).count()
pos_val = d.groupby(by=['pos']).count()
neu_val = d.groupby(by=['neu']).count()
neg.append(neg_val['text'][1])
pos.append(pos_val['text'][1])
neu.append(neu_val['text'][1])
labels = ['BoD','Acquire','Normal']
x = np.arange(len(labels))
width = 0.3
fig, ax = plt.subplots()
fig.set_figheight(6)
fig.set_figwidth(8)
# text for labels, title and custom x,y-axis tick labels
ax.set_ylabel('Tweets')
ax.set_xlabel('Timeframe')
ax.set_title('Sentiment Analysis')
ax.set_xticks(x, labels)
rects1 = ax.bar(x - width, neg, width, label='Negative', color='red')
rects2 = ax.bar(x, neu, width, label='Neutral', color='blue')
rects3 = ax.bar(x + width, pos, width, label='Positive', color='green')
ax.bar_label(rects1, padding=3)
ax.bar_label(rects2, padding=3)
ax.bar_label(rects3, padding=3)
ax.legend(['Negative','Neutral','Positive'])
plt.show()
BoD represents the timeframe when Elon Musk joined Twitter's board of directors. Acquire represents the timeframe when he announced the acquisition of Twitter. Normal represents the timeframe in the middle of July. This visualization shows that the BoD and normal timeframe have almost identical results. For the Acquire timeframe, we can see that there is a difference in the neutral tweets, but the ratio of negative to positive is roughly the same.
To test our hypothesis, we will run an ANOVA test to determine if there is a statistically significant difference between the three timeframes.
# each row of `groups` holds one timeframe's (negative, neutral, positive) counts
groups = np.transpose(np.array([neg, neu, pos]))
# one-way ANOVA comparing the three timeframes
anova = stats.f_oneway(groups[0], groups[1], groups[2])
print("F-statistic:", anova[0])
print("p-Value:", anova[1])
F-statistic: 1.912090163934427 p-Value: 0.2278057985849194
So, we got a p-value of 0.228. Since this is larger than the significance level of 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the three timeframes. In other words, we found no statistically significant difference in the sentiment of Elon Musk's tweets between a normal 11-day span and the timeframes where he made a major business announcement.
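To make the test less of a black box: the F-statistic that f_oneway reports is the ratio of between-group to within-group mean squares. A minimal pure-Python computation on toy groups (the numbers are ours, chosen for clean arithmetic) shows the mechanics:

```python
# One-way ANOVA F-statistic by hand, on toy groups chosen for clean numbers.
groups = [[1, 2, 3], [2, 3, 4], [3, 4, 5]]

n = sum(len(g) for g in groups)              # total observations (9)
k = len(groups)                              # number of groups (3)
grand_mean = sum(sum(g) for g in groups) / n

means = [sum(g) / len(g) for g in groups]
# between-group sum of squares: how far each group mean sits from the grand mean
ssb = sum(len(g) * (m - grand_mean) ** 2 for g, m in zip(groups, means))
# within-group sum of squares: spread of each group around its own mean
ssw = sum((x - m) ** 2 for g, m in zip(groups, means) for x in g)

f_stat = (ssb / (k - 1)) / (ssw / (n - k))
print(f_stat)  # 3.0 for these groups
```

scipy.stats.f_oneway computes this same F and then derives the p-value from the F distribution with (k-1, n-k) degrees of freedom.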
This tutorial guided you through an in-depth analysis of Elon Musk's tweets. Here are some important takeaways:
In this tutorial, we demonstrated several ways to collect data. We used a combination of finding a dataset on Kaggle and using the Twitter API to gather useful data. Throughout our data exploration, we used a variety of charts to illustrate our data, ranging from scatterplots to various bar graphs.
There are several routes to take for further investigation. Although this analysis is constrained to Elon Musk's tweets, we can apply the same pipeline to other social media influencers and learn about the sentiment of their tweets. Additionally, this analysis can go beyond Twitter, and we can investigate other platforms.