Project: Wrangling and Analyze Data¶

Data Gathering¶

In the cells below, gather all three pieces of data for this project and load them into the notebook. Note: the method required to gather each piece of data is different.

  1. Directly download the WeRateDogs Twitter archive data (twitter-archive-enhanced.csv)
In [1]:
# Import needed libraries

import pandas as pd # data wrangling
import numpy as np # mathematical calculations
import time # timer functions
import json # work with json files
import os # write files to disk 
from PIL import Image # read Image files
from io import BytesIO # decode image files
import matplotlib.pyplot as plt # visualizations
In [2]:
# Read the archived data provided into pandas dataframe

enhanced_twitter_archive = pd.read_csv("twitter-archive-enhanced.csv")
In [21]:
# View sample of archive data
enhanced_twitter_archive.sample(3)
Out[21]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
786 774757898236878852 NaN NaN 2016-09-10 23:54:11 +0000 <a href="http://twitter.com/download/iphone" r... This is Finley. She's a Beneboop Cumbersplash.... NaN NaN NaN https://twitter.com/dog_rates/status/774757898... 12 10 Finley None None None None
2198 668815180734689280 NaN NaN 2015-11-23 15:35:39 +0000 <a href="http://twitter.com/download/iphone" r... This is a wild Toblerone from Papua New Guinea... NaN NaN NaN https://twitter.com/dog_rates/status/668815180... 7 10 a None None None None
1670 682406705142087680 NaN NaN 2015-12-31 03:43:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Patrick. He's a bigass pupper. 7/10 ht... NaN NaN NaN https://twitter.com/dog_rates/status/682406705... 7 10 Patrick None None pupper None
In [23]:
twitter_arc_ids = enhanced_twitter_archive["tweet_id"]

twitter_arc_ids
Out[23]:
0       892420643555336193
1       892177421306343426
2       891815181378084864
3       891689557279858688
4       891327558926688256
               ...        
2351    666049248165822465
2352    666044226329800704
2353    666033412701032449
2354    666029285002620928
2355    666020888022790149
Name: tweet_id, Length: 2356, dtype: int64
  2. Use the Requests library to download the tweet image predictions (image_predictions.tsv)
In [118]:
import requests

r = requests.get("https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv")
In [123]:
r.status_code
Out[123]:
200
In [15]:
# Download the response into tsv file

with open("image_predictions.tsv", mode='wb') as file:
    file.write(r.content)
In [24]:
#  read provided image predictions into dataframe

image_predictions = pd.read_csv("image_predictions.tsv", sep="\t")
  3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)
In [22]:
# Import the tweepy library and Authenticate

import tweepy

consumer_key = 'redacted'
consumer_secret = 'redacted'
access_token = 'redacted'
access_secret = 'redacted'

auth = tweepy.OAuth1UserHandler(
   consumer_key, consumer_secret, access_token, access_secret
)

api = tweepy.API(auth, wait_on_rate_limit=True)  # wait_on_rate_limit_notify was removed in Tweepy v4
In [ ]:
# Read content of the api response and write into json format 
counter = 0
failures = {}
begin_time = time.time()

with open("tweet_json.txt", "w") as tweet_json:
    for tweet_id in twitter_arc_ids:
        counter += 1
        print(f"Tweet {counter} : {tweet_id}")
        try:
            tweet = api.get_status(tweet_id, tweet_mode="extended")
            print("Success")
            json.dump(tweet._json, tweet_json)
            tweet_json.write("\n")
        except tweepy.errors.HTTPException as e:
            print("Fail")
            failures[tweet_id] = e

run_time = round(time.time() - begin_time, 2)
In [62]:
print(f"It took {run_time} seconds to get {counter} tweets")
It took 1639.08 seconds to get 2356 tweets
In [26]:
# read tweet_json.txt line by line for tweet id, retweeted status, retweet count, and favorite count
tweets_info_list = []

with open("tweet_json.txt", "r") as f:
    for line in f:
        tweets_info_dict = json.loads(line)
        tweet_id = tweets_info_dict["id"]
        tweet_retweeted = tweets_info_dict["retweeted"]
        tweet_retweet_count = tweets_info_dict["retweet_count"]
        tweet_fav_count = tweets_info_dict["favorite_count"]
        tweets_info_list.append(
            {"tweet_id" : tweet_id,
            "tweet_retweeted" : tweet_retweeted,
            "tweet_retweet_count": tweet_retweet_count,
            "tweet_fav_count" : tweet_fav_count}
        )
        
type(tweets_info_list)
Out[26]:
list
In [9]:
# Check for all keys in the loaded tweets info dictionary
tweets_info_dict.keys()
Out[9]:
dict_keys(['created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'extended_entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'possibly_sensitive_appealable', 'lang'])
In [27]:
# Convert data downloaded from twitter api to pandas dataframe
twitter_api_data = pd.DataFrame(tweets_info_list)
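The line-by-line JSON parsing used above can be reproduced offline; this minimal sketch runs the same pattern against an in-memory buffer holding two made-up tweet records (the field names mirror the Twitter status objects used here).

```python
import json
from io import StringIO

import pandas as pd

# Two made-up records mimicking tweet_json.txt (one JSON object per line)
buffer = StringIO(
    '{"id": 1, "retweeted": false, "retweet_count": 10, "favorite_count": 25}\n'
    '{"id": 2, "retweeted": false, "retweet_count": 3, "favorite_count": 7}\n'
)

records = []
for line in buffer:
    d = json.loads(line)
    records.append(
        {"tweet_id": d["id"],
         "tweet_retweeted": d["retweeted"],
         "tweet_retweet_count": d["retweet_count"],
         "tweet_fav_count": d["favorite_count"]}
    )

demo_df = pd.DataFrame(records)
print(demo_df.shape)  # (2, 4)
```

Building a list of dicts first and converting once at the end is cheaper than appending rows to a DataFrame inside the loop.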

Assessing Data¶

In this section, detect and document at least eight (8) quality issues and two (2) tidiness issues. You must use both visual and programmatic assessment to assess the data.

Note: pay attention to the following key points when you assess the data.

  • You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
  • Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
  • The fact that the rating numerators are greater than the denominators does not need to be cleaned. This unique rating system is a big part of the popularity of WeRateDogs.
  • You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.

Visually assess enhanced twitter archive dataset

In [28]:
enhanced_twitter_archive
Out[28]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None None None None
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... 6 10 a None None None None
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... 9 10 a None None None None
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... 7 10 a None None None None
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None None None None

2356 rows × 17 columns

Visually assess image prediction dataset

In [29]:
image_predictions
Out[29]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
... ... ... ... ... ... ... ... ... ... ... ... ...
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 orange 0.097049 False bagel 0.085851 False banana 0.076110 False

2075 rows × 12 columns

Visually assess data containing tweet information obtained via the Twitter API

In [30]:
twitter_api_data
Out[30]:
tweet_id tweet_retweeted tweet_retweet_count tweet_fav_count
0 892420643555336193 False 7007 33809
1 892177421306343426 False 5301 29328
2 891815181378084864 False 3480 22048
3 891689557279858688 False 7226 36935
4 891327558926688256 False 7760 35310
... ... ... ... ...
1766 669753178989142016 False 341 709
1767 669749430875258880 False 55 229
1768 669684865554620416 False 75 448
1769 669683899023405056 False 94 339
1770 669682095984410625 False 116 312

1771 rows × 4 columns

Programmatically assess the twitter archive dataset (dataset 1)

In [31]:
# Get random samples from dataset 1
enhanced_twitter_archive.sample(4)
Out[31]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
920 756303284449767430 NaN NaN 2016-07-22 01:42:09 +0000 <a href="http://twitter.com/download/iphone" r... Pwease accept dis rose on behalf of dog. 11/10... NaN NaN NaN https://twitter.com/dog_rates/status/756303284... 11 10 None None None None None
1160 724004602748780546 NaN NaN 2016-04-23 22:38:43 +0000 <a href="http://twitter.com/download/iphone" r... This is Luther. He saw a ghost. Spooked af. 11... NaN NaN NaN https://twitter.com/dog_rates/status/724004602... 11 10 Luther None None pupper None
1970 673295268553605120 NaN NaN 2015-12-06 00:17:55 +0000 <a href="http://twitter.com/download/iphone" r... Meet Eve. She's a raging alcoholic 8/10 (would... NaN NaN NaN https://twitter.com/dog_rates/status/673295268... 8 10 Eve None None pupper None
1178 719551379208073216 NaN NaN 2016-04-11 15:43:12 +0000 <a href="http://twitter.com/download/iphone" r... This is Harnold. He accidentally opened the fr... NaN NaN NaN https://twitter.com/dog_rates/status/719551379... 10 10 Harnold None None None None
In [32]:
# get general info on dataset1
enhanced_twitter_archive.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 non-null   object 
 14  floofer                     2356 non-null   object 
 15  pupper                      2356 non-null   object 
 16  puppo                       2356 non-null   object 
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
In [33]:
# get the number of names in the dataset that are duplicated.
enhanced_twitter_archive["name"].duplicated().sum()
Out[33]:
1399
In [34]:
# get the counts of the names in dataset
enhanced_twitter_archive["name"].value_counts()
Out[34]:
None          745
a              55
Charlie        12
Cooper         11
Lucy           11
             ... 
Dex             1
Ace             1
Tayzie          1
Grizzie         1
Christoper      1
Name: name, Length: 957, dtype: int64
In [35]:
enhanced_twitter_archive["name"].nunique()
Out[35]:
957
In [36]:
enhanced_twitter_archive["name"].sample(20)
Out[36]:
1123     Crystal
652         BeBe
2328        None
1553    Clarence
1640      Sweets
1451       Wyatt
1268       Cecil
1043        None
1067       Baloo
1814        None
881         None
99          None
678       Stella
2065        None
1418     Jessiga
1626    Theodore
2149        None
1093        None
1414     Cuddles
656       Maddie
Name: name, dtype: object
In [37]:
# get the number of observations that are not retweets
enhanced_twitter_archive["retweeted_status_user_id"].isnull().sum()
Out[37]:
2175
In [38]:
# descriptive summary of numerical variables for dataset 1
enhanced_twitter_archive.describe().style.format("{0:,.0f}")
Out[38]:
  tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator
count 2,356 78 78 181 181 2,356 2,356
mean 742,771,590,321,719,808 745,507,917,855,750,656 20,141,706,360,873,208 772,039,961,038,007,040 12,416,983,653,017,580 13 10
std 68,567,047,444,761,032 75,824,920,044,192,880 125,279,666,255,236,320 62,369,278,105,055,600 95,992,535,331,517,536 46 7
min 666,020,888,022,790,144 665,814,696,700,723,200 11,856,342 666,104,133,288,665,088 783,214 0 0
25% 678,398,938,214,475,776 675,741,911,993,464,832 308,637,449 718,631,497,683,582,976 4,196,983,835 10 10
50% 719,627,934,716,235,776 703,870,840,226,598,912 4,196,983,835 780,465,709,297,995,776 4,196,983,835 11 10
75% 799,337,304,954,252,288 825,780,371,286,566,912 4,196,983,835 820,314,633,777,061,888 4,196,983,835 12 10
max 892,420,643,555,336,192 886,266,357,075,128,320 840,547,864,354,918,400 887,473,957,103,951,872 787,461,778,435,289,088 1,776 170
In [39]:
# check dataset for extremely high ratings that might skew the analysis
enhanced_twitter_archive.query('rating_numerator > 50').info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18 entries, 188 to 2074
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    18 non-null     int64  
 1   in_reply_to_status_id       5 non-null      float64
 2   in_reply_to_user_id         5 non-null      float64
 3   timestamp                   18 non-null     object 
 4   source                      18 non-null     object 
 5   text                        18 non-null     object 
 6   retweeted_status_id         1 non-null      float64
 7   retweeted_status_user_id    1 non-null      float64
 8   retweeted_status_timestamp  1 non-null      object 
 9   expanded_urls               14 non-null     object 
 10  rating_numerator            18 non-null     int64  
 11  rating_denominator          18 non-null     int64  
 12  name                        18 non-null     object 
 13  doggo                       18 non-null     object 
 14  floofer                     18 non-null     object 
 15  pupper                      18 non-null     object 
 16  puppo                       18 non-null     object 
dtypes: float64(4), int64(3), object(10)
memory usage: 2.5+ KB

Programmatically assess the image prediction dataset (dataset 2)

In [40]:
# view samples from the dataset
image_predictions.sample(5)
Out[40]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
1646 808501579447930884 https://pbs.twimg.com/media/Czhf4XtVQAAIqpd.jpg 2 Airedale 0.454239 True cocker_spaniel 0.219323 True Irish_terrier 0.093193 True
1211 742528092657332225 https://pbs.twimg.com/media/Ck39W0JWUAApgnH.jpg 2 sunglasses 0.900864 False sunglass 0.040291 False snorkel 0.009333 False
1029 711008018775851008 https://pbs.twimg.com/media/Cd4CBQFW8AAY3ND.jpg 1 French_bulldog 0.731405 True Boston_bull 0.150672 True pug 0.021811 True
57 667073648344346624 https://pbs.twimg.com/media/CUHr8WbWEAEBPgf.jpg 1 Chihuahua 0.483682 True pug 0.092494 True Brabancon_griffon 0.057495 True
958 705475953783398401 https://pbs.twimg.com/media/CcpaoR9WAAAKlJJ.jpg 1 golden_retriever 0.908784 True Labrador_retriever 0.030361 True tennis_ball 0.004996 False
In [41]:
# assess summary of each series in dataset 2
image_predictions.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
In [ ]:
# download all images in the image prediction dataset

# Create a directory if it doesn't already exist
folder_name = 'dog_images'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

for x in range(0, len(image_predictions)):
    try:
        tweet_id = image_predictions["tweet_id"][x]
        image = requests.get(image_predictions["jpg_url"][x])
        Image.open(BytesIO(image.content)).save(folder_name + "/" + str(tweet_id) + '.' + "jpg")

    except Exception as e:
        print(f"Error with {tweet_id} : {e}")
In [88]:
# Get summary of numerical variables
image_predictions.describe().style.format("{0:,.0f}")
Out[88]:
  tweet_id img_num p1_conf p2_conf p3_conf
count 2,075 2,075 2,075 2,075 2,075
mean 738,451,357,852,539,008 1 1 0 0
std 67,852,033,330,235,656 1 0 0 0
min 666,020,888,022,790,144 1 0 0 0
25% 676,483,507,139,540,992 1 0 0 0
50% 711,998,809,858,043,904 1 1 0 0
75% 793,203,448,525,178,880 1 1 0 0
max 892,420,643,555,336,192 4 1 0 0

Programmatically assess the API-gathered dataset (dataset 3)

In [89]:
# Get information of dataset
twitter_api_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771 entries, 0 to 1770
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   tweet_id             1771 non-null   int64
 1   tweet_retweeted      1771 non-null   bool 
 2   tweet_retweet_count  1771 non-null   int64
 3   tweet_fav_count      1771 non-null   int64
dtypes: bool(1), int64(3)
memory usage: 43.4 KB
In [90]:
# pick random samples from dataset
twitter_api_data.sample(10)
Out[90]:
tweet_id tweet_retweeted tweet_retweet_count tweet_fav_count
142 863427515083354112 False 80 1961
1457 675898130735476737 False 505 1448
1351 679828447187857408 False 12607 33194
1557 673906403526995968 False 1438 2847
747 777621514455814149 False 2341 8223
1728 670449342516494336 False 570 1076
1686 671141549288370177 False 572 1024
1756 670046952931721218 False 152 576
1350 679844490799091713 False 687 2186
1421 676949632774234114 False 348 1188
In [91]:
# Get statistical summary for dataset 3
twitter_api_data.describe().style.format("{0:,.0f}")
Out[91]:
  tweet_id tweet_retweet_count tweet_fav_count
count 1,771 1,771 1,771
mean 751,366,856,863,915,392 2,731 7,941
std 73,096,232,396,284,832 4,109 11,354
min 669,682,095,984,410,624 1 0
25% 680,800,102,202,632,192 562 1,392
50% 709,901,256,215,666,688 1,362 3,116
75% 817,657,043,551,162,368 3,231 10,587
max 892,420,643,555,336,192 51,675 124,103

Quality issues¶

  1. dataset 1: The timestamp column is of object (string) type instead of datetime.

  2. dataset 1: The name column contains many placeholder words such as "None", "a", and "an" that are not real names.

  3. dataset 1: The rating columns contain some extreme outliers.

  4. dataset 1: Some rows are retweets and should not count as part of the dataset.

  5. dataset 1: Some columns are missing many values, and some are not needed for our analysis.

  6. dataset 2: Some of the images at the provided links are not pictures of dogs.

  7. dataset 2: There is a lack of consistency in the cases (mixed upper and lower cases) for the predictions (p1, p2, p3).

  8. dataset 2: The jpg_url column can be dropped as it is not needed for our analysis.

  9. dataset 1: The "None" in the doggo, floofer, pupper, puppo columns are missing values and should be treated as such.

Tidiness issues¶

  1. dataset 1: four columns with the dog's stage (doggo, floofer, pupper, puppo) should be all in the same column.

  2. dataset 1, dataset 2, and dataset 3: should be combined into one dataset since they form one observational unit.
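The first tidiness issue can be previewed with a small sketch on a made-up frame: collapse the four stage columns into a single dog_stage column by treating the literal string "None" as missing and keeping the first non-null stage per row. (Rows tagged with more than one stage would need extra handling not shown here.)

```python
import numpy as np
import pandas as pd

# Made-up miniature of the archive's four dog-stage columns
demo = pd.DataFrame({
    "tweet_id": [1, 2, 3],
    "doggo":   ["None", "doggo", "None"],
    "floofer": ["None", "None", "None"],
    "pupper":  ["pupper", "None", "None"],
    "puppo":   ["None", "None", "None"],
})

stage_cols = ["doggo", "floofer", "pupper", "puppo"]
# Replace the literal "None" with NaN, then take the first non-null value per row
stages = demo[stage_cols].replace("None", np.nan)
demo["dog_stage"] = stages.bfill(axis=1).iloc[:, 0]
print(demo["dog_stage"].tolist())  # ['pupper', 'doggo', nan]
```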

Cleaning Data¶

In this section, clean all of the issues you documented while assessing.

Note: Make a copy of the original data before cleaning. Cleaning includes merging individual pieces of data according to the rules of tidy data. The result should be a high-quality and tidy master pandas DataFrame (or DataFrames, if appropriate).

In [92]:
# define a helper function for dropping columns throughout the cleaning process.
def drop_df_cols(df, cols_to_drop):
    """
    Drop the columns listed in cols_to_drop from the dataframe df in place,
    then return the first two rows for a quick visual check.
    """
    df.drop(columns=cols_to_drop, inplace=True)
    return df.head(2)
In [93]:
# Make copies of original pieces of data
enhanced_twitter_archive_clean = enhanced_twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
twitter_api_data_clean = twitter_api_data.copy()
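These three copies will ultimately be combined on tweet_id to address tidiness issue 2. A minimal sketch of that merge on made-up miniature frames (inner joins keep only tweets present in all three sources):

```python
import pandas as pd

# Hypothetical miniature versions of the three cleaned datasets
archive = pd.DataFrame({"tweet_id": [1, 2, 3], "rating_numerator": [12, 13, 10]})
predictions = pd.DataFrame({"tweet_id": [1, 3], "p1": ["pug", "collie"]})
api_counts = pd.DataFrame({"tweet_id": [1, 3], "tweet_fav_count": [100, 50]})

# Chain two inner merges on the shared key
master = (archive
          .merge(predictions, on="tweet_id", how="inner")
          .merge(api_counts, on="tweet_id", how="inner"))
print(master.shape)  # (2, 4)
```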

Issue #1:¶

Define¶

Convert timestamp column to datetime datatype

Code¶

In [94]:
enhanced_twitter_archive_clean["timestamp"] = pd.to_datetime(enhanced_twitter_archive_clean["timestamp"])

Test¶

In [95]:
enhanced_twitter_archive_clean["timestamp"].dtype
Out[95]:
datetime64[ns, UTC]

Issue #2:¶

Define¶

Replace all words that are not names with "Name Unavailable" in dataset 1

Code¶

In [96]:
# create a list of non-names found during visual assessment of dataset and replace each instance of the name.
list_non_names = ["a", "None", "all", "an", "by", "getting", "his",
                  "just", "life", "light", "mad", "my", "not", "old",
                  "one", "quite", "space", "the", "this", "unacceptable", "very"]

for item in list_non_names:
    enhanced_twitter_archive_clean["name"].replace(item, "Name Unavailable", inplace=True)
In [97]:
# the extracted name "O" is a truncation of "O'Malley"; restore the full name
enhanced_twitter_archive_clean["name"].replace("O", "O'Malley", inplace=True)
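A broader heuristic than the hand-curated list above (a sketch, not used in the cleaning itself): the mis-extracted non-names in this archive are all lowercase words, so a case check plus the literal "None" can flag them automatically.

```python
import pandas as pd

# Made-up mix of real names and extraction artifacts
names = pd.Series(["Charlie", "a", "None", "quite", "Lucy"])

# Lowercase entries (plus the literal "None") are likely not real dog names
non_name_mask = names.str.islower() | (names == "None")
print(names[non_name_mask].tolist())  # ['a', 'None', 'quite']
```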

Test¶

In [98]:
#  confirm there are no names like "None", "a", "an", etc in dataset.
enhanced_twitter_archive_clean["name"].value_counts()
Out[98]:
Name Unavailable    848
Charlie              12
Oliver               11
Cooper               11
Lucy                 11
                   ... 
Beckham               1
Devón                 1
Gert                  1
Dex                   1
Christoper            1
Name: name, Length: 937, dtype: int64
In [99]:
# confirm there is no dog with the name "O" in dataset
enhanced_twitter_archive_clean.query('name == "O"')
Out[99]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo

Issue #3:¶

Define¶

Drop outlier ratings. For the purpose of this cleaning process, I will drop any row whose rating numerator or denominator is 50 or higher.

Code¶

In [100]:
#  mask dataframe to include only ratings below 50.
enhanced_twitter_archive_clean = enhanced_twitter_archive_clean.query('rating_numerator < 50') 
enhanced_twitter_archive_clean = enhanced_twitter_archive_clean.query('rating_denominator < 50')
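The two query calls above can equally be written as a single expression; a sketch on a made-up frame using pandas' query syntax with `and`:

```python
import pandas as pd

# Made-up ratings including two outliers
demo = pd.DataFrame({
    "rating_numerator": [12, 1776, 44],
    "rating_denominator": [10, 10, 170],
})

# One pass keeps only rows where both ratings are below 50
filtered = demo.query("rating_numerator < 50 and rating_denominator < 50")
print(len(filtered))  # 1
```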

Test¶

In [101]:
# confirm there are no rows with ratings of 50 or higher
enhanced_twitter_archive_clean.describe().style.format("{0:,.0f}")
Out[101]:
  tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator
count 2,336 73 73 180 180 2,336 2,336
mean 742,704,800,927,771,776 740,825,796,186,616,192 21,521,275,228,697,864 771,958,465,916,830,208 12,485,966,872,217,768 11 10
std 68,593,374,754,350,128 74,195,613,216,839,824 129,440,420,764,057,680 62,533,586,713,814,240 96,255,798,436,530,464 2 1
min 666,020,888,022,790,144 665,814,696,700,723,200 11,856,342 666,104,133,288,665,088 783,214 0 2
25% 678,386,830,676,510,720 675,497,103,322,386,432 1,198,988,510 717,175,879,759,181,824 4,196,983,835 10 10
50% 719,627,934,716,235,776 703,041,949,650,034,688 4,196,983,835 780,150,020,947,441,664 4,196,983,835 11 10
75% 799,300,023,567,684,608 813,127,251,579,564,032 4,196,983,835 820,423,404,544,218,112 4,196,983,835 12 10
max 892,420,643,555,336,192 886,266,357,075,128,320 840,547,864,354,918,400 887,473,957,103,951,872 787,461,778,435,289,088 44 40

Issue #4:¶

Define¶

Drop rows that are retweets.

Code¶

In [102]:
# drop rows containing retweets by filtering them out
enhanced_twitter_archive_clean = enhanced_twitter_archive_clean[enhanced_twitter_archive_clean["retweeted_status_id"].isnull()]

Test¶

In [103]:
#  check to be sure there are no longer any entries in the retweet columns
enhanced_twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2156 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2156 non-null   int64              
 1   in_reply_to_status_id       73 non-null     float64            
 2   in_reply_to_user_id         73 non-null     float64            
 3   timestamp                   2156 non-null   datetime64[ns, UTC]
 4   source                      2156 non-null   object             
 5   text                        2156 non-null   object             
 6   retweeted_status_id         0 non-null      float64            
 7   retweeted_status_user_id    0 non-null      float64            
 8   retweeted_status_timestamp  0 non-null      object             
 9   expanded_urls               2102 non-null   object             
 10  rating_numerator            2156 non-null   int64              
 11  rating_denominator          2156 non-null   int64              
 12  name                        2156 non-null   object             
 13  doggo                       2156 non-null   object             
 14  floofer                     2156 non-null   object             
 15  pupper                      2156 non-null   object             
 16  puppo                       2156 non-null   object             
dtypes: datetime64[ns, UTC](1), float64(4), int64(3), object(9)
memory usage: 303.2+ KB

Issue #5:¶

Define¶

Drop columns that are missing many values or are not needed for our analysis: source, text, expanded_urls, in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp.

Code¶

In [104]:
# drop columns that are not useful for analysis
cols_to_drop = ["source", "text", "expanded_urls", "in_reply_to_status_id", "in_reply_to_user_id", "retweeted_status_id", "retweeted_status_user_id", "retweeted_status_timestamp"]

drop_df_cols(enhanced_twitter_archive_clean, cols_to_drop)
Out[104]:
tweet_id timestamp rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 2017-08-01 16:23:56+00:00 13 10 Phineas None None None None
1 892177421306343426 2017-08-01 00:17:27+00:00 13 10 Tilly None None None None

Test¶

In [105]:
# check to be sure dropped columns are no longer part of our dataframe
enhanced_twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2156 entries, 0 to 2355
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2156 non-null   int64              
 1   timestamp           2156 non-null   datetime64[ns, UTC]
 2   rating_numerator    2156 non-null   int64              
 3   rating_denominator  2156 non-null   int64              
 4   name                2156 non-null   object             
 5   doggo               2156 non-null   object             
 6   floofer             2156 non-null   object             
 7   pupper              2156 non-null   object             
 8   puppo               2156 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 168.4+ KB

Issue #6:¶

Define¶

Drop rows that have their image predictions as other animals apart from dogs.

Code¶

In [106]:
# filter out rows that have p1_dog == False
image_predictions_clean = image_predictions_clean[image_predictions_clean["p1_dog"] == True]

Test¶

In [107]:
# check to be sure there are no p1 predictions that are not dogs.
image_predictions_clean["p1_dog"].value_counts()
Out[107]:
True    1532
Name: p1_dog, dtype: int64

Issue #7:¶

Define¶

Convert all prediction names to lowercase for consistency.

Code¶

In [108]:
# convert all cases to lower in all prediction columns
cols_to_convert = ["p1", "p2", "p3"]

for col in cols_to_convert:
    image_predictions_clean[col] = image_predictions_clean[col].str.lower()
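As a standalone illustration (toy data, not the project frame), `Series.str.lower()` lowercases every string element and passes missing values through untouched:

```python
import pandas as pd

# Toy Series of breed names: str.lower() lowercases strings elementwise
# and leaves missing entries as NaN rather than raising.
preds = pd.Series(["Welsh_Springer_Spaniel", "Redbone", None])
lowered = preds.str.lower()
```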

Test¶

In [109]:
# check dataframe to be sure all predictions are now in lower case.
image_predictions_clean
Out[109]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 welsh_springer_spaniel 0.465074 True collie 0.156665 True shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 german_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True rottweiler 0.243682 True doberman 0.154629 True
... ... ... ... ... ... ... ... ... ... ... ... ...
2068 890971913173991426 https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg 1 appenzeller 0.341703 True border_collie 0.199287 True ice_lolly 0.193548 False
2069 891087950875897856 https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg 1 chesapeake_bay_retriever 0.425595 True irish_terrier 0.116317 True indian_elephant 0.076902 False
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True english_springer 0.225770 True german_short-haired_pointer 0.175219 True
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 chihuahua 0.323581 True pekinese 0.090647 True papillon 0.068957 True

1532 rows × 12 columns

Issue #8:¶

Define¶

  • Drop the jpg_url column from the predictions dataset

Code¶

In [110]:
# drop the jpg_url column
cols_to_drop = ["jpg_url"]

drop_df_cols(image_predictions_clean, cols_to_drop)
Out[110]:
tweet_id img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 1 welsh_springer_spaniel 0.465074 True collie 0.156665 True shetland_sheepdog 0.061428 True
1 666029285002620928 1 redbone 0.506826 True miniature_pinscher 0.074192 True rhodesian_ridgeback 0.072010 True

Test¶

In [111]:
# check to be sure the jpg_url column is no longer part of our dataframe
image_predictions_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1532 entries, 0 to 2073
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  1532 non-null   int64  
 1   img_num   1532 non-null   int64  
 2   p1        1532 non-null   object 
 3   p1_conf   1532 non-null   float64
 4   p1_dog    1532 non-null   bool   
 5   p2        1532 non-null   object 
 6   p2_conf   1532 non-null   float64
 7   p2_dog    1532 non-null   bool   
 8   p3        1532 non-null   object 
 9   p3_conf   1532 non-null   float64
 10  p3_dog    1532 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(3)
memory usage: 112.2+ KB

Issue #9:¶

Define¶

Set all "None" string values in the doggo, floofer, pupper, and puppo columns to missing values (NaN).

Code¶

In [112]:
# replace all values of the dog stages provided that are None with NaN.
cols_to_replace_none = ["doggo", "floofer", "pupper", "puppo"]

for col in cols_to_replace_none:
    enhanced_twitter_archive_clean[col] = enhanced_twitter_archive_clean[col].replace({"None":np.nan})
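A toy illustration (not the project data) of the replacement step: mapping the literal string "None" to np.nan makes pandas treat those entries as missing, so they drop out of the non-null counts shown by info():

```python
import numpy as np
import pandas as pd

# Toy stage column: after the replace, the "None" strings become true
# missing values, so notna() counts only genuine stage labels.
stages = pd.Series(["doggo", "None", "pupper", "None"])
stages = stages.replace({"None": np.nan})
```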

Test¶

In [113]:
# check that the stage columns now count only genuine (non-"None") values as non-null
enhanced_twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2156 entries, 0 to 2355
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2156 non-null   int64              
 1   timestamp           2156 non-null   datetime64[ns, UTC]
 2   rating_numerator    2156 non-null   int64              
 3   rating_denominator  2156 non-null   int64              
 4   name                2156 non-null   object             
 5   doggo               87 non-null     object             
 6   floofer             10 non-null     object             
 7   pupper              234 non-null    object             
 8   puppo               25 non-null     object             
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 168.4+ KB

Issue #10:¶

Define¶

  • Combine the four separate dog stage columns into one column named "dog_stages"

Code¶

In [114]:
# combine all four columns, then drop the individual stages in separate columns

enhanced_twitter_archive_clean["dog_stages"] = enhanced_twitter_archive_clean[["doggo", "floofer", "pupper", "puppo"]].apply(lambda x:','.join(x.dropna().astype(str)), axis=1)
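A toy illustration (not the project frame) of the row-wise join above: dropna() discards the missing stages in each row and ",".join() concatenates whatever remains, yielding an empty string for rows with no stage and comma-joined values such as "doggo,pupper" for multi-stage rows:

```python
import numpy as np
import pandas as pd

# Toy frame with two stage columns: the row-wise apply drops NaNs and
# joins the surviving labels with a comma.
df = pd.DataFrame({
    "doggo":  ["doggo", np.nan, np.nan],
    "pupper": ["pupper", "pupper", np.nan],
})
df["dog_stages"] = df[["doggo", "pupper"]].apply(
    lambda x: ",".join(x.dropna().astype(str)), axis=1)
```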
In [115]:
# drop columns
cols_to_drop = ["doggo", "floofer", "pupper", "puppo"]
drop_df_cols(enhanced_twitter_archive_clean, cols_to_drop)
Out[115]:
tweet_id timestamp rating_numerator rating_denominator name dog_stages
0 892420643555336193 2017-08-01 16:23:56+00:00 13 10 Phineas
1 892177421306343426 2017-08-01 00:17:27+00:00 13 10 Tilly

Test¶

In [116]:
# check that the individual stage columns have been dropped and the dog_stages column was created
enhanced_twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2156 entries, 0 to 2355
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2156 non-null   int64              
 1   timestamp           2156 non-null   datetime64[ns, UTC]
 2   rating_numerator    2156 non-null   int64              
 3   rating_denominator  2156 non-null   int64              
 4   name                2156 non-null   object             
 5   dog_stages          2156 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 117.9+ KB
In [117]:
# check for the value counts of the different dog stages captured
enhanced_twitter_archive_clean["dog_stages"].value_counts()
Out[117]:
                 1812
pupper            224
doggo              75
puppo              24
doggo,pupper       10
floofer             9
doggo,puppo         1
doggo,floofer       1
Name: dog_stages, dtype: int64

Issue #11:¶

Define¶

  • Merge the Twitter archive dataset (enhanced_twitter_archive_clean), the image predictions dataset (image_predictions_clean), and the API-gathered dataset (twitter_api_data_clean) into one master dataset.

Code¶

In [118]:
enhanced_twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2156 entries, 0 to 2355
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   tweet_id            2156 non-null   int64              
 1   timestamp           2156 non-null   datetime64[ns, UTC]
 2   rating_numerator    2156 non-null   int64              
 3   rating_denominator  2156 non-null   int64              
 4   name                2156 non-null   object             
 5   dog_stages          2156 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 117.9+ KB
In [119]:
image_predictions_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1532 entries, 0 to 2073
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  1532 non-null   int64  
 1   img_num   1532 non-null   int64  
 2   p1        1532 non-null   object 
 3   p1_conf   1532 non-null   float64
 4   p1_dog    1532 non-null   bool   
 5   p2        1532 non-null   object 
 6   p2_conf   1532 non-null   float64
 7   p2_dog    1532 non-null   bool   
 8   p3        1532 non-null   object 
 9   p3_conf   1532 non-null   float64
 10  p3_dog    1532 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(3)
memory usage: 112.2+ KB
In [120]:
twitter_api_data_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771 entries, 0 to 1770
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   tweet_id             1771 non-null   int64
 1   tweet_retweeted      1771 non-null   bool 
 2   tweet_retweet_count  1771 non-null   int64
 3   tweet_fav_count      1771 non-null   int64
dtypes: bool(1), int64(3)
memory usage: 43.4 KB
In [121]:
# merge the twitter archive dataset with the image predictions dataset
twitter_archive_first_merge = pd.merge(enhanced_twitter_archive_clean, image_predictions_clean, on=["tweet_id"])
In [122]:
# merge the first combined dataset with the api gathered dataset to form our master dataset
twitter_archive_master = pd.merge(twitter_archive_first_merge, twitter_api_data_clean, on=["tweet_id"])
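On toy data (not the project frames), `pd.merge` defaults to an inner join on the key, so only tweet_ids present in both frames survive; this is why the master dataset (1094 rows) ends up smaller than any of its three inputs:

```python
import pandas as pd

# Toy frames sharing tweet_ids 2 and 3: the default inner join keeps only
# the keys found in both inputs.
left = pd.DataFrame({"tweet_id": [1, 2, 3], "name": ["a", "b", "c"]})
right = pd.DataFrame({"tweet_id": [2, 3, 4], "fav": [10, 20, 30]})
merged = pd.merge(left, right, on=["tweet_id"])
```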

Test¶

In [123]:
# check the general information of the new master dataframe
twitter_archive_master.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1094 entries, 0 to 1093
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   tweet_id             1094 non-null   int64              
 1   timestamp            1094 non-null   datetime64[ns, UTC]
 2   rating_numerator     1094 non-null   int64              
 3   rating_denominator   1094 non-null   int64              
 4   name                 1094 non-null   object             
 5   dog_stages           1094 non-null   object             
 6   img_num              1094 non-null   int64              
 7   p1                   1094 non-null   object             
 8   p1_conf              1094 non-null   float64            
 9   p1_dog               1094 non-null   bool               
 10  p2                   1094 non-null   object             
 11  p2_conf              1094 non-null   float64            
 12  p2_dog               1094 non-null   bool               
 13  p3                   1094 non-null   object             
 14  p3_conf              1094 non-null   float64            
 15  p3_dog               1094 non-null   bool               
 16  tweet_retweeted      1094 non-null   bool               
 17  tweet_retweet_count  1094 non-null   int64              
 18  tweet_fav_count      1094 non-null   int64              
dtypes: bool(4), datetime64[ns, UTC](1), float64(3), int64(6), object(5)
memory usage: 141.0+ KB
In [124]:
# view sample of our new master dataset
twitter_archive_master.sample(5)
Out[124]:
tweet_id timestamp rating_numerator rating_denominator name dog_stages img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog tweet_retweeted tweet_retweet_count tweet_fav_count
185 835297930240217089 2017-02-25 01:18:40+00:00 12 10 Ash 1 rottweiler 0.341276 True border_terrier 0.336220 True gordon_setter 0.045448 True False 2747 15450
207 831322785565769729 2017-02-14 02:02:51+00:00 12 10 Pete doggo 1 old_english_sheepdog 0.999715 True tibetan_terrier 0.000046 True guinea_pig 0.000041 False False 1447 8757
251 821886076407029760 2017-01-19 01:04:45+00:00 13 10 Jimison 1 golden_retriever 0.266238 True cocker_spaniel 0.223325 True irish_setter 0.151631 True False 2142 10681
529 709852847387627521 2016-03-15 21:24:41+00:00 12 10 Name Unavailable 2 chihuahua 0.945629 True pomeranian 0.019204 True west_highland_white_terrier 0.010134 True False 1092 3244
726 688547210804498433 2016-01-17 02:23:42+00:00 9 10 Frönq 1 papillon 0.531279 True blenheim_spaniel 0.214197 True border_collie 0.053840 True False 627 2405

Storing Data¶

Save the gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [125]:
twitter_archive_master.to_csv("twitter_archive_master.csv", index=False)
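One caveat worth noting (a sketch with toy data, not part of the original notebook): a CSV round trip preserves the values but not all dtypes; for example, the timezone-aware timestamp column comes back as plain strings unless re-parsed with `parse_dates`:

```python
import io
import pandas as pd

# Toy round trip through an in-memory buffer: the tz-aware timestamp is
# written as text and read back as an object (string) column.
df = pd.DataFrame({
    "tweet_id": [1],
    "timestamp": pd.to_datetime(["2017-08-01"], utc=True),
})
buf = io.StringIO()
df.to_csv(buf, index=False)
buf.seek(0)
reloaded = pd.read_csv(buf)
```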

Analyzing and Visualizing Data¶

In this section, analyze and visualize your wrangled data. You must produce at least three (3) insights and one (1) visualization.

In [126]:
twitter_archive_master.describe().style.format("{0:,.0f}")
Out[126]:
  tweet_id rating_numerator rating_denominator img_num p1_conf p2_conf p3_conf tweet_retweet_count tweet_fav_count
count 1,094 1,094 1,094 1,094 1,094 1,094 1,094 1,094 1,094
mean 750,446,822,646,363,136 11 10 1 1 0 0 2,483 9,229
std 74,050,123,614,936,336 2 1 1 0 0 0 3,772 11,867
min 669,683,899,023,405,056 2 7 1 0 0 0 35 215
25% 680,810,404,888,595,456 10 10 1 0 0 0 590 2,024
50% 708,125,206,227,873,792 11 10 1 1 0 0 1,352 5,018
75% 817,158,712,309,808,128 12 10 1 1 0 0 2,998 12,457
max 892,177,421,306,343,424 44 40 4 1 0 0 51,675 124,103
In [137]:
# Check for number of observations in each dog stage
twitter_archive_master["dog_stages"].value_counts()
Out[137]:
                 912
pupper           122
doggo             35
puppo             14
doggo,pupper       5
floofer            4
doggo,puppo        1
doggo,floofer      1
Name: dog_stages, dtype: int64
In [139]:
# visualize count of dog stages
twitter_archive_master["dog_stages"].value_counts().plot(
    kind="bar",
    title="Count of Dog Stages",
    xlabel="Dog Stage",
    ylabel="Number of Dog tweets"
);
In [128]:
# which dog stage has the highest number of likes on average
twitter_archive_master.groupby("dog_stages")["tweet_fav_count"].mean().sort_values().tail()
Out[128]:
dog_stages
doggo,floofer    14842.000000
floofer          15153.500000
doggo            16831.485714
puppo            26090.714286
doggo,puppo      41912.000000
Name: tweet_fav_count, dtype: float64
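A toy illustration (not the project data) of the groupby pattern used above: `groupby().mean()` averages the metric within each group, and `sort_values().tail()` surfaces the highest-averaging groups:

```python
import pandas as pd

# Toy frame: average favorites per stage, sorted ascending so tail()
# would show the top groups.
df = pd.DataFrame({"stage": ["doggo", "doggo", "puppo"],
                   "favs": [10, 20, 40]})
avg = df.groupby("stage")["favs"].mean().sort_values()
```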
In [129]:
# what name(s) is/are most popular among dogs?
twitter_archive_master["name"].value_counts()
Out[129]:
Name Unavailable    288
Charlie               9
Cooper                8
Penny                 7
Daisy                 7
                   ... 
Chef                  1
Mauve                 1
Bones                 1
Milo                  1
Kloey                 1
Name: name, Length: 601, dtype: int64
In [130]:
# most predicted dog breed
twitter_archive_master["p1"].value_counts()
Out[130]:
golden_retriever           99
pembroke                   66
labrador_retriever         63
chihuahua                  62
pug                        42
                           ..
groenendael                 1
toy_terrier                 1
wire-haired_fox_terrier     1
clumber                     1
black-and-tan_coonhound     1
Name: p1, Length: 111, dtype: int64

Insights:¶

  1. Our cleaned master dataset contains 1094 observations, all of which are original tweets. Of the named dog stages in these tweets, pupper is the most common, occurring in 122 of them, plus 5 more observations in which pupper appears combined with another stage. It is also interesting to note that, according to the data, the doggo,puppo combination has the highest number of likes on average.

  2. Excluding the 288 tweets where no name was available, our data shows that Charlie is the most popular dog name, followed closely by Cooper, and then by Penny.

  3. The model predicted the golden_retriever breed most often (99 times), followed by the pembroke breed (66) and the labrador_retriever breed (63).

  4. From the scatter plot visualization below, we can see a weak-to-moderate positive correlation (r ≈ 0.34) between a dog's rating and its number of likes. The most-liked tweets came from dogs rated between 12 and 14.

Visualization¶

In [131]:
# plot a histogram of numerical variables in dataset
twitter_archive_master.hist(figsize=(15, 15));
In [140]:
# visualize the relationship between a dog's rating and the amount of likes the tweet gets.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot([0, 50], [0, 130000], linestyle="--", color="red")
twitter_archive_master.plot("rating_numerator",
                            "tweet_fav_count",
                            kind="scatter",
                            color="#800080",
                            title="Relationship between likes and dog rating",
                            ax=ax,
                            alpha=0.9);
print("The coefficient of correlation between a dog's rating and the number of likes is:",
      round(twitter_archive_master["rating_numerator"].corr(twitter_archive_master["tweet_fav_count"]), 4))
The coefficient of correlation between a dog's rating and the number of likes is: 0.3351

Resources¶

  • Tweepy Documentation
  • Fetch Tweets Using Their IDs With Tweepy, Twitter API and Python - Amil Emrah
  • Retrieve List of Tweets using Tweepy - Stack Overflow
  • Create DataFrame from Python Dictionary - GeeksforGeeks
  • Replace - Pandas Documentation
  • Convert strings to lowercase in Pandas - Data to fish
  • Pandas Concat - Pandas Documentation
  • Fix TypeError: sequence item 0 - Stack Overflow
  • Merge Pandas DataFrame - Pandas Documentation