import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
import difflib
import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline
Overview
In this notebook, we will use cosine similarity to produce song recommendations. We will use data from Spotify to cluster the songs into different song types and see which kinds of songs share the same attributes.
We will use the k-means algorithm to sort the songs into different types. We will then use the clustered database to create a recommendation system and make a few recommendations.
By the end of this notebook you will be able to find similar songs to your favorite song, and hopefully, find new favorite songs :)
We’ll use a dataset from Kaggle: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
The dataset contains audio features for over 1.2 million songs, obtained with the Spotify API.
Reference for these audio features can be found here: https://developer.spotify.com/documentation/web-api/reference/#/operations/get-audio-features
Let’s go!
First, we’ll import some important packages:
Now we can load the data:
# Load the Spotify track-features CSV (expected in the working directory)
# into the module-level dataframe that every later cell reads and mutates.
df = pd.read_csv("tracks_features.csv")
# Preview the first rows.
df.head()
id | name | album | album_id | artists | artist_ids | track_number | disc_number | explicit | danceability | ... | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | year | release_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7lmeHLHBe4nmXzuXc0HDjk | Testify | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 1 | 1 | False | 0.470 | ... | 0.0727 | 0.02610 | 0.000011 | 0.3560 | 0.503 | 117.906 | 210133 | 4.0 | 1999 | 1999-11-02 |
1 | 1wsRitfRRtWyEapl0q22o8 | Guerrilla Radio | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 2 | 1 | True | 0.599 | ... | 0.1880 | 0.01290 | 0.000071 | 0.1550 | 0.489 | 103.680 | 206200 | 4.0 | 1999 | 1999-11-02 |
2 | 1hR0fIFK2qRG3f3RF70pb7 | Calm Like a Bomb | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 3 | 1 | False | 0.315 | ... | 0.4830 | 0.02340 | 0.000002 | 0.1220 | 0.370 | 149.749 | 298893 | 4.0 | 1999 | 1999-11-02 |
3 | 2lbASgTSoDO7MTuLAXlTW0 | Mic Check | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 4 | 1 | True | 0.440 | ... | 0.2370 | 0.16300 | 0.000004 | 0.1210 | 0.574 | 96.752 | 213640 | 4.0 | 1999 | 1999-11-02 |
4 | 1MQTmpYOZ6fcMQc56Hdo7T | Sleep Now In the Fire | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 5 | 1 | False | 0.426 | ... | 0.0701 | 0.00162 | 0.105000 | 0.0789 | 0.539 | 127.059 | 205600 | 4.0 | 1999 | 1999-11-02 |
5 rows × 24 columns
We’ll convert the “explicit” and “release_date” columns to a numerical value:
# Cast the True/False "explicit" flag to 0/1 so it can be used as a numeric feature.
df['explicit'] = df['explicit'].astype('int')
# Parse the release dates and store them as int64 timestamps.
# NOTE(review): errors='coerce' turns unparseable dates into NaT; casting NaT
# to int64 presumably yields a large negative sentinel that would distort the
# later standardisation -- confirm how many rows are affected.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').astype('int64')
And now we bin the years into decades (we want our system to recommend songs from the same decade):
# Replace each release year with a decade code:
#   <= 1910 -> 1, 1911-1920 -> 2, 1921-1930 -> 3, ..., 2011-2020 -> 12, 2021-2030 -> 13.
# Years after 2030 (and missing values) are left untouched.
# The result is assigned back to the column directly: calling
# df['year'].mask(..., inplace=True) is a chained in-place update, which does
# not modify df when pandas Copy-on-Write is active.
years = df['year']
decade_code = ((years - 1911) // 10 + 2).where(years > 1910, 1)
df['year'] = decade_code.where(years <= 2030, years)
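Let’s normalize the numerical columns. Why do we have to? Because the features are measured on very different scales (for example, duration in milliseconds versus a danceability score between 0 and 1). Standardizing them puts every feature on a comparable scale, so that each one contributes equally to the distances.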
# Numeric columns that are standardised before clustering.
cols_to_normalize = [
    'acousticness', 'danceability', 'duration_ms',
    'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence', 'time_signature', 'year', 'release_date',
]
# Standardise the columns in place with scikit-learn's StandardScaler.
scaler = StandardScaler()
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])
We’ll drop unnecessary columns (mainly textual columns)
# Remove identifier and bookkeeping columns that are not used as features.
df.drop(
    columns=['id', 'album', 'album_id', 'artist_ids', 'track_number', 'disc_number', 'key'],
    inplace=True,
)
# Preview the first two rows.
df.head(2)
name | artists | explicit | danceability | energy | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | year | release_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Testify | ['Rage Against The Machine'] | 0 | -0.121562 | 1.589717 | 0.918016 | 1 | -0.100716 | -1.092029 | -0.751691 | 0.855599 | 0.277330 | 0.008781 | -0.238621 | 0.298487 | -1.082527 | -0.734249 |
1 | Guerrilla Radio | ['Rage Against The Machine'] | 1 | 0.558569 | 1.518454 | 0.865739 | 1 | 0.893324 | -1.126297 | -0.751533 | -0.258227 | 0.225571 | -0.451057 | -0.262868 | 0.298487 | -1.082527 | -0.734249 |
We can now gather the relevant columns for clustering
# Feature matrix used for clustering: every numeric column, without the
# textual "name" and "artists" columns.
X = df[['explicit', 'danceability', 'energy', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'duration_ms', 'time_signature', 'release_date', 'year']]
How many clusters should we use? We’ll check with the elbow method
#creating a list of inertia scores might take a while
# inertia = []
# for n in range(1,20):
# kmeans = KMeans(n_clusters = n, random_state=7)
# kmeans.fit(X)
# inertia.append(kmeans.inertia_)
# inertia
#creating a line graph of the inertia scores
# plt.figure(figsize = (12,8))
# plt.plot(range(1,20),inertia)
# plt.title('Inertia scores')
# plt.show()
This isn’t enough. We should also look at the silhouette scores.
#creating a list of silhouette scores
# from sklearn.metrics import silhouette_score
# silhouette = []
# for n in range(2,20):
# kmeans = KMeans(n_clusters = n, random_state=7)
# kmeans.fit(X)
# score = silhouette_score(X,kmeans.labels_)
# silhouette.append(score)
# silhouette
# ploting the silhouette scores
# plt.figure(figsize = (12,8))
# plt.plot(range(2,20),silhouette)
# plt.title('silhouette scores')
We can now group the songs into 18 clusters.
# Cluster the songs into 18 groups ("styles") with k-means.
from sklearn.cluster import KMeans

X = df[['explicit', 'danceability', 'energy', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'duration_ms', 'time_signature', 'release_date', 'year']]
# n_init is set explicitly to the value that was the default when this was
# written (silences scikit-learn's FutureWarning), and random_state is fixed
# (same seed as the elbow/silhouette experiments) so that the cluster labels
# are reproducible between runs.
km = KMeans(n_clusters=18, n_init=10, random_state=7)
# Store each song's cluster label next to its features.
df['cluster'] = km.fit_predict(X)
df.head()
c:\Users\nisan\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\cluster\_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
name | artists | explicit | danceability | energy | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | year | release_date | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Testify | ['Rage Against The Machine'] | 0 | -0.121562 | 1.589717 | 0.918016 | 1 | -0.100716 | -1.092029 | -0.751691 | 0.855599 | 0.277330 | 0.008781 | -0.238621 | 0.298487 | -1.082527 | -0.734249 | 1 |
1 | Guerrilla Radio | ['Rage Against The Machine'] | 1 | 0.558569 | 1.518454 | 0.865739 | 1 | 0.893324 | -1.126297 | -0.751533 | -0.258227 | 0.225571 | -0.451057 | -0.262868 | 0.298487 | -1.082527 | -0.734249 | 1 |
2 | Calm Like a Bomb | ['Rage Against The Machine'] | 0 | -0.938773 | 1.562569 | 0.914435 | 1 | 3.436618 | -1.099039 | -0.751715 | -0.441094 | -0.214381 | 1.038065 | 0.308569 | 0.298487 | -1.082527 | -0.734249 | 2 |
3 | Mic Check | ['Rage Against The Machine'] | 1 | -0.279732 | 1.552388 | 0.856287 | 0 | 1.315769 | -0.736631 | -0.751711 | -0.446636 | 0.539822 | -0.674995 | -0.217001 | 0.298487 | -1.082527 | -0.734249 | 1 |
4 | Sleep Now In the Fire | ['Rage Against The Machine'] | 0 | -0.353544 | 1.423437 | 0.727529 | 1 | -0.123132 | -1.155581 | -0.472676 | -0.679930 | 0.410424 | 0.304640 | -0.266567 | 0.298487 | -1.082527 | -0.734249 | 1 |
We can also one-hot encode the cluster column:
# One-hot encode the cluster label into one indicator column per cluster.
# `sparse_output` is the current name of the deprecated `sparse` argument.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
# Reuse df's index so the column assignment below aligns row by row.
enc = pd.DataFrame(
    encoder.fit_transform(np.array(df["cluster"]).reshape(-1, 1)),
    index=df.index,
)
# The encoder emits its columns in the order of `categories_` (sorted labels),
# not in order of first appearance, so the columns must be labelled from
# `categories_`; labelling them with df["cluster"].unique() attaches the
# wrong cluster id to most columns.
enc.columns = encoder.categories_[0]
df[enc.columns] = enc
df.head()
c:\Users\nisan\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\preprocessing\_encoders.py:828: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
warnings.warn(
name | artists | explicit | danceability | energy | loudness | mode | speechiness | acousticness | instrumentalness | ... | 17 | 8 | 9 | 7 | 15 | 5 | 3 | 0 | 13 | 16 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Testify | ['Rage Against The Machine'] | 0 | -0.121562 | 1.589717 | 0.918016 | 1 | -0.100716 | -1.092029 | -0.751691 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | Guerrilla Radio | ['Rage Against The Machine'] | 1 | 0.558569 | 1.518454 | 0.865739 | 1 | 0.893324 | -1.126297 | -0.751533 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | Calm Like a Bomb | ['Rage Against The Machine'] | 0 | -0.938773 | 1.562569 | 0.914435 | 1 | 3.436618 | -1.099039 | -0.751715 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | Mic Check | ['Rage Against The Machine'] | 1 | -0.279732 | 1.552388 | 0.856287 | 0 | 1.315769 | -0.736631 | -0.751711 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | Sleep Now In the Fire | ['Rage Against The Machine'] | 0 | -0.353544 | 1.423437 | 0.727529 | 1 | -0.123132 | -1.155581 | -0.472676 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 36 columns
Let’s sort the dataframe by release date and drop duplicate songs, i.e. rows that share the same name and artists (keeping the earliest release).
# Order the songs by release date so that, for every (name, artists) pair,
# the earliest release comes first ...
df.sort_values(by=['release_date'], inplace=True)
# ... and keep only that first occurrence of each pair.
df.drop_duplicates(subset=['name', 'artists'], inplace=True)
We should also clean the artists’ column of some symbols
# The artists column holds stringified Python lists such as "['Europe']".
# Strip the square brackets and single quotes in a single pass. The pattern is
# an explicit regex character class, so the result does not depend on the
# pandas default for `regex`.
df["artists"] = df["artists"].str.replace(r"[\[\]']", "", regex=True)
C:\Users\nisan\AppData\Local\Temp\ipykernel_1540\4026273699.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df["artists"] = df["artists"].str.replace("[","")
C:\Users\nisan\AppData\Local\Temp\ipykernel_1540\4026273699.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
df["artists"] = df["artists"].str.replace("]","")
At this stage, you can choose to use PCA to speed up the results.
# from sklearn.decomposition import PCA
# pca = PCA(n_components = 3)
# x = data.drop(columns=['name'])
# x = pca.fit_transform(x)
# explained_variance = pca.explained_variance_ratio_
# print(explained_variance)
# dataset_pca = pd.DataFrame({'pca1': x[:, 0], 'pca2': x[:, 1], 'pca3': x[:, 2], 'name': data['name'], 'cluster': data['cluster']})
# dataset_pca.head()
Let’s create a function that finds similar songs using Cosine Similarity. You can replace it with any other relevant function from: https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
def find_similar_songs(best_match, artist, n_recs=5):
    """Print the songs that are most similar to the given song.

    The song is looked up in the module-level ``df`` by exact name and artist.
    The search is restricted to the song's cluster, and the other songs of
    that cluster are ranked by cosine distance to it (smallest first).

    Args:
        best_match: Song name exactly as stored in ``df['name']``.
        artist: Artist string exactly as stored in ``df['artists']``.
        n_recs: Maximum number of recommendations to print (default 5).
            Fewer are printed when the cluster holds fewer other songs.

    Returns:
        None. The recommendations are printed as a table with the columns
        ``Similar Song`` and ``Artist``. If the song/artist pair is not
        found, a message is printed instead.
    """
    is_query = ((df['name'] == best_match) & (df['artists'] == artist)).to_numpy()
    if not is_query.any():
        # Nothing to compare against; avoid an opaque IndexError further down.
        print(f"\nCould not find {best_match!r} by {artist!r} in the database.")
        return

    # Filter to the relevant cluster only (the cluster of the first match).
    cluster_labels = df['cluster'].to_numpy()
    in_cluster = cluster_labels == cluster_labels[is_query][0]
    cluster_songs = df[in_cluster]

    # Row position of the query song inside the cluster subset.
    query_pos = int(np.flatnonzero(is_query[in_cluster])[0])

    # Drop the categorical columns; everything left is a numeric feature.
    features = cluster_songs.drop(
        columns=['name', 'cluster', 'artists']
    ).to_numpy(dtype=float)

    # Cosine distance from the query song to every song of the cluster,
    # computed in one vectorised call instead of a Python loop over rows.
    distances = distance.cdist(features[[query_pos]], features, metric='cosine')[0]

    # Rank by distance (stable sort: ties keep their row order) and remove the
    # query song by position, rather than assuming it is ranked first --
    # another song at the same distance could otherwise push the query itself
    # into the recommendations.
    ranking = np.argsort(distances, kind='stable')
    ranking = ranking[ranking != query_pos][:max(n_recs, 0)]

    recs_df = pd.DataFrame({
        'Similar Song': cluster_songs['name'].to_numpy()[ranking],
        'Artist': cluster_songs['artists'].to_numpy()[ranking],
    })

    print("\nHere are songs similar to", best_match)
    print("*************************************************")
    print(recs_df)
Our similarity function needs a song and an artist. Let’s create another function that will get a song’s name and guide the user to find the relevant artist from our database.
def get_song_and_artist():
    """Interactively ask the user for a song and for its artist.

    The typed song title is fuzzy-matched against ``df['name']`` with
    ``difflib.get_close_matches``. The artists stored for the matched title
    are then printed and the user is asked to pick one of them.

    Returns:
        A ``(best_match, artist)`` tuple: the matched song name from the
        database and the artist string exactly as typed by the user.
    """
    # Match against each distinct, non-missing name once; missing names are
    # not strings and cannot be compared by difflib.
    known_names = df['name'].dropna().unique()

    # Keep asking until the title resembles a known song, instead of failing
    # with an IndexError when there is no close match.
    while True:
        song = input('Please enter a song that you like:')
        matches = difflib.get_close_matches(song, known_names, n=1)
        if matches:
            break
        print('\nNo similar song name was found, please try again.')
    best_match = matches[0]

    # All artists that have a song with the matched name.
    artist_lst = df.loc[df['name'] == best_match, 'artists'].tolist()

    print('\nFound the following artists: ')
    print(artist_lst)
    artist = input('\nWho is the artist? Please choose from the list:')
    return best_match, artist
We can try it out:
# Ask the user for a song and its artist, then print the recommendations.
best_match, artist = get_song_and_artist()
find_similar_songs(best_match, artist)
Found the following artists:
['Europe', '"Pickin On Series"', '"Scott Bradlees Postmodern Jukebox", Gunhild Carling']
Here are songs similar to The Final Countdown
*************************************************
Similar Song Artist
0 On and On Triumph
1 Inoiz Itoiz
2 Outlaw Brighton Rock
3 Kissin' Dynamite AC/DC
4 The Walk - Remastered Version Eurythmics, Annie Lennox, Dave Stewart
Here’s a short demo: