import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
# Location of the titles dataset; this is a machine-specific absolute path.
csv_path = "D:/ML & AI EDU/Tensorflow Lectures Udemy/titles.csv"
df = pd.read_csv(csv_path)
print("Dataset loaded...")
Dataset loaded...
# Preview the first rows of the loaded table.
df.head()
id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era p... | 1945 | TV-MA | 48 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | NaN | 0.600 | NaN |
1 | tm84618 | Taxi Driver | MOVIE | A mentally unstable Vietnam War veteran works ... | 1976 | R | 113 | ['crime', 'drama'] | ['US'] | NaN | tt0075314 | 8.3 | 795222.0 | 27.612 | 8.2 |
2 | tm127384 | Monty Python and the Holy Grail | MOVIE | King Arthur, accompanied by his squire, recrui... | 1975 | PG | 91 | ['comedy', 'fantasy'] | ['GB'] | NaN | tt0071853 | 8.2 | 530877.0 | 18.216 | 7.8 |
3 | tm70993 | Life of Brian | MOVIE | Brian Cohen is an average young Jewish man, bu... | 1979 | R | 94 | ['comedy'] | ['GB'] | NaN | tt0079470 | 8.0 | 392419.0 | 17.505 | 7.8 |
4 | tm190788 | The Exorcist | MOVIE | 12-year-old Regan MacNeil begins to adapt an e... | 1973 | R | 133 | ['horror'] | ['US'] | NaN | tt0070047 | 8.1 | 391942.0 | 95.337 | 7.7 |
# (rows, columns) of the raw table.
df.shape
(5806, 15)
# Column names available in the dataset.
df.columns
Index(['id', 'title', 'type', 'description', 'release_year', 'age_certification', 'runtime', 'genres', 'production_countries', 'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'], dtype='object')
# Number of missing values in each column.
df.isnull().sum()
id 0 title 1 type 0 description 18 release_year 0 age_certification 2610 runtime 0 genres 0 production_countries 0 seasons 3759 imdb_id 444 imdb_score 523 imdb_votes 539 tmdb_popularity 94 tmdb_score 318 dtype: int64
# Fill missing values in the numeric columns with that column's mean.
# numeric_only=True restricts the reduction to numeric columns explicitly;
# without it pandas emits a FutureWarning on the text columns (and, per that
# warning, will raise TypeError in a future version). Text columns are left
# untouched because the resulting Series has no entry for them.
df = df.fillna(df.mean(numeric_only=True))
C:\Users\Sanchit\AppData\Local\Temp/ipykernel_15676/114435927.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction. df = df.fillna(df.mean())
# Re-check missing values per column after the fillna above.
df.isnull().sum()
id 0 title 1 type 0 description 18 release_year 0 age_certification 2610 runtime 0 genres 0 production_countries 0 seasons 0 imdb_id 444 imdb_score 0 imdb_votes 0 tmdb_popularity 0 tmdb_score 0 dtype: int64
# Per-row boolean flag: True where a row exactly repeats an earlier row.
df.duplicated()
0 False 1 False 2 False 3 False 4 False ... 5801 False 5802 False 5803 False 5804 False 5805 False Length: 5806, dtype: bool
# Dtype and non-null count of every column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5806 entries, 0 to 5805 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5806 non-null object 1 title 5805 non-null object 2 type 5806 non-null object 3 description 5788 non-null object 4 release_year 5806 non-null int64 5 age_certification 3196 non-null object 6 runtime 5806 non-null int64 7 genres 5806 non-null object 8 production_countries 5806 non-null object 9 seasons 5806 non-null float64 10 imdb_id 5362 non-null object 11 imdb_score 5806 non-null float64 12 imdb_votes 5806 non-null float64 13 tmdb_popularity 5806 non-null float64 14 tmdb_score 5806 non-null float64 dtypes: float64(5), int64(2), object(8) memory usage: 680.5+ KB
# Number of whitespace-separated words in each description.
# Missing descriptions are treated as empty text (0 words); converting NaN
# with str() would otherwise count it as the single word "nan".
df['word_count'] = (
    df['description'].fillna('').astype(str).str.split().str.len()
)

# Plotting the word count
df['word_count'].plot(
    kind='hist',
    bins=50,
    figsize=(12, 8),
    title='Word Count Distribution for movie and show descriptions',
)
<AxesSubplot:title={'center':'Word Count Distribution for book descriptions'}, ylabel='Frequency'>
# Bigram TF-IDF over the `genres` column. Each cell is a list-like string
# such as "['crime', 'drama']", so a bigram is a pair of adjacent genre tokens.
# lowercase=False keeps the tokens exactly as they appear in the data.
tf = TfidfVectorizer(ngram_range=(2, 2), stop_words='english', lowercase=False)
tfidf_matrix = tf.fit_transform(df['genres'])

# Column sums: the total TF-IDF weight of every bigram across all rows.
total_words = tfidf_matrix.sum(axis=0)

# Pair each bigram with its summed weight, heaviest first.
freq = [(word, total_words[0, idx]) for word, idx in tf.vocabulary_.items()]
freq = sorted(freq, key=lambda x: x[1], reverse=True)

# Converting into dataframe. The columns are named at construction so they
# exist even if no bigram was found. NOTE: 'count' holds the summed TF-IDF
# weight, not a raw occurrence count; the name is kept for compatibility.
bigram = pd.DataFrame(freq, columns=['bigram', 'count'])

# Taking first 20 records
bigram = bigram.head(20)

# Plotting the bigram distribution
bigram.plot(
    x='bigram',
    y='count',
    kind='bar',
    title="Bigram distribution for the top 20 genre pairs (summed TF-IDF weight)",
    figsize=(15, 7),
)
<AxesSubplot:title={'center':'Bigram disribution for the top 20 words in the book description'}, xlabel='bigram'>
# Rows whose release year is strictly later than 2000.
df[df['release_year'] > 2000]
id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | word_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
243 | ts4 | Breaking Bad | SHOW | When Walter White, a New Mexico chemistry teac... | 2008 | TV-MA | 48 | ['drama', 'thriller', 'crime'] | ['US'] | 5.000000 | tt0903747 | 9.500000 | 1.727694e+06 | 337.419 | 8.800000 | 56 |
244 | ts9 | The Walking Dead | SHOW | Sheriff's deputy Rick Grimes awakens from a co... | 2010 | TV-MA | 46 | ['action', 'drama', 'scifi', 'thriller', 'horr... | ['US'] | 11.000000 | tt1520211 | 8.200000 | 9.451250e+05 | 773.190 | 8.100000 | 32 |
245 | ts26091 | The Staircase | SHOW | Academy Award-winning documentary filmmaker, J... | 2004 | TV-MA | 49 | ['crime', 'documentation', 'drama'] | ['FR'] | 2.000000 | tt0388644 | 7.800000 | 2.153100e+04 | 14.185 | 7.700000 | 95 |
246 | ts11 | Downton Abbey | SHOW | A chronicle of the lives of the aristocratic C... | 2010 | TV-14 | 58 | ['drama', 'romance', 'european'] | ['GB'] | 6.000000 | tt1606375 | 8.700000 | 1.977440e+05 | 57.029 | 8.100000 | 33 |
247 | ts21469 | Grey's Anatomy | SHOW | Follows the personal and professional lives of... | 2005 | TV-14 | 49 | ['drama', 'romance'] | ['US'] | 18.000000 | tt0413573 | 7.600000 | 2.936180e+05 | 1215.393 | 8.300000 | 17 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5801 | tm1014599 | Fine Wine | MOVIE | A beautiful love story that can happen between... | 2021 | NaN | 100 | ['romance', 'drama'] | ['NG'] | 2.165608 | tt13857480 | 6.900000 | 3.900000e+01 | 0.966 | 6.818039 | 15 |
5802 | tm1108171 | Edis Starlight | MOVIE | Rising star Edis's career journey with ups and... | 2021 | NaN | 74 | ['music', 'documentation'] | [] | 2.165608 | NaN | 6.533447 | 2.340719e+04 | 1.036 | 8.500000 | 9 |
5803 | tm1045018 | Clash | MOVIE | A man from Nigeria returns to his family in Ca... | 2021 | NaN | 88 | ['family', 'drama'] | ['NG', 'CA'] | 2.165608 | tt14620732 | 6.500000 | 3.200000e+01 | 0.709 | 6.818039 | 26 |
5804 | tm1098060 | Shadow Parties | MOVIE | A family faces destruction in a long-running c... | 2021 | NaN | 116 | ['action', 'thriller'] | [] | 2.165608 | tt10168094 | 6.200000 | 9.000000e+00 | 2.186 | 6.818039 | 20 |
5805 | ts271048 | Mighty Little Bheem: Kite Festival | SHOW | With winter behind them, Bheem and his townspe... | 2021 | NaN | 0 | ['family', 'comedy', 'animation'] | [] | 1.000000 | tt13711094 | 8.800000 | 1.600000e+01 | 0.979 | 10.000000 | 24 |
5563 rows × 16 columns
# Release years of every row whose type is SHOW.
df[df['type'] == 'SHOW']['release_year']
0 1945 5 1969 29 1972 47 1989 55 1982 ... 5793 2021 5794 2021 5796 2021 5799 2021 5805 2021 Name: release_year, Length: 2047, dtype: int64
All TV shows with their release years.
# Title and runtime of the TV-MA rated shows, longest runtime first.
is_tv_ma_show = (df['type'] == 'SHOW') & (df['age_certification'] == 'TV-MA')
tv_ma_shows = df.loc[is_tv_ma_show, ['title', 'runtime']]
tv_ma_shows.sort_values(by=['runtime'], ascending=False)
title | runtime | |
---|---|---|
3268 | The Hateful Eight: Extended Version | 199 |
3828 | 1994 | 178 |
388 | Dead Set | 141 |
2390 | The Yard | 108 |
1476 | Intersection | 97 |
... | ... | ... |
4008 | The Forest of Love: Deep Cut | 0 |
5213 | Savage Beauty | 0 |
4776 | I'm with the Band: Nasty Cherry | 0 |
896 | Masameer | 0 |
5612 | Metal Shop Masters | 0 |
841 rows × 2 columns
# Histogram of release years, restricted to rows typed as MOVIE.
movie_years = df.loc[df['type'] == "MOVIE", 'release_year']
movie_years.hist()
plt.show()
The largest number of movies on Netflix were released between 2015 and 2021.
# Count rows by type among those with more than 3 seasons.
# NOTE(review): `seasons` was mean-imputed by the earlier fillna, so rows that
# had no season data now hold the column mean instead of NaN.
df[df['seasons'] >3]['type'].value_counts()
SHOW 305 Name: type, dtype: int64
There are 305 TV shows with more than 3 seasons.
We choose the description feature for finding similarities in content.
# Description of the first row, as a sample of the text being compared.
df.head(1)['description']
0 This collection includes 12 World War II-era p... Name: description, dtype: object
# Replace missing descriptions with empty strings before vectorising.
df['description'] = df['description'].fillna("")

# Word-level TF-IDF over unigrams, bigrams and trigrams of the descriptions:
# accents are stripped, English stop words are removed, and terms appearing
# in fewer than 3 descriptions are dropped (min_df=3).
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    strip_accents="unicode",
    min_df=3,
    stop_words='english',
)
tfidf_matrix = tf.fit_transform(df['description'])
# Inspect the sparse TF-IDF matrix built from the descriptions.
tfidf_matrix
<5806x9285 sparse matrix of type '<class 'numpy.float64'>' with 110729 stored elements in Compressed Sparse Row format>
# (number of descriptions, number of vocabulary terms).
tfidf_matrix.shape
(5806, 9285)
# Pairwise sigmoid-kernel similarity between all description vectors.
# With the second argument omitted, the kernel compares the matrix to itself.
sig = sigmoid_kernel(tfidf_matrix)
# Similarity scores of the first row against every row (including itself).
sig[0]
array([0.76163938, 0.76159546, 0.76159416, ..., 0.76159416, 0.76159416, 0.76159416])
# Lookup from title to row label in df.
# Series.drop_duplicates() compares the VALUES (the row labels, which are all
# unique), so it never removed repeated titles. De-duplicate on the index
# instead, keeping the first row for each title, so that indices[title]
# always yields a single row label.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
# Inspect the title-to-row lookup.
indices
title Five Came Back: The Reference Films 0 Taxi Driver 1 Monty Python and the Holy Grail 2 Life of Brian 3 The Exorcist 4 ... Fine Wine 5801 Edis Starlight 5802 Clash 5803 Shadow Parties 5804 Mighty Little Bheem: Kite Festival 5805 Length: 5806, dtype: int64
def give_rec(title, sig=sig):
    """Return the ten titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series.
    sig : 2-D array-like, optional
        Pairwise similarity scores; row ``i`` holds the scores of row ``i``
        of ``df`` against every row. Defaults to the module-level ``sig``.
        Assumes the row labels stored in ``indices`` equal row positions
        (true for a default RangeIndex) -- TODO confirm if ``df`` is re-indexed.

    Returns
    -------
    pandas.Series
        Up to ten entries of ``df['title']``, most similar first. The queried
        row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row labels;
    # use the first occurrence so that `sig[idx]` is a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    row_scores = sig[idx]
    # sorted() is stable, so rows with equal scores keep their original order.
    ranked = sorted(
        range(len(row_scores)),
        key=lambda pos: row_scores[pos],
        reverse=True,
    )
    # Drop the queried row explicitly rather than assuming it sorts first:
    # on ties (e.g. an empty or repeated description) another row can lead.
    movie_indices = [pos for pos in ranked if pos != idx][:10]
    return df['title'].iloc[movie_indices]
# Example query: recommendations for the title 'Fine Wine'.
give_rec('Fine Wine')
673 Fida 4692 Soulmate 5565 Most Eligible Bachelor 450 Jodhaa Akbar 1820 Love Is a Story 3150 Ascharya Fuck It 2979 Solo el amor 5661 Making Malinche: A Documentary by Nacho Cano 2783 Love.com 5394 Why Are You Like This? Name: title, dtype: object
We recommend movies to the user based on similarities in their descriptions.
The sigmoid kernel is used to measure similarity.
Extract some useful information.