# NLTK library for natural language processing
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')  # required by nltk.corpus.stopwords
nltk.download('wordnet')    # required by WordNetLemmatizer
nltk.download('omw-1.4')    # WordNet multilingual data, needed by some NLTK versions
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Sentiment Analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
#Dataset management/data handling
import pandas as pd
import numpy as np #for calculation
import string
import re
#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
# Initialize SenticNet and VADER Sentiment Analyzer
sn = SenticNet()
vader = SentimentIntensityAnalyzer()
import os  # for building platform-independent file paths
# Define the base directory
######### Change based on your path here ##########
base_dir = r'Song_csv'
# List of artist names
artists = [
'ArianaGrande', 'Beyonce', 'BillieEilish', 'CardiB', 'CharliePuth',
'ColdPlay', 'Drake', 'DuaLipa', 'EdSheeran', 'Eminem', 'JustinBieber',
'KatyPerry', 'Khalid', 'LadyGaga', 'Maroon5', 'NickiMinaj',
'PostMalone', 'Rihanna', 'SelenaGomez', 'TaylorSwift'
]
# Build the list of file paths
file_paths = [os.path.join(base_dir, f"{artist}.csv") for artist in artists]
# Print file paths (optional)
for file in file_paths:
print(f"File path: {file}")
# Read CSV files into DataFrame list
df_all = [pd.read_csv(file) for file in file_paths]
####### Data preparation ###########
# Preprocessing function
# Remove any song that is not the original version of an artist's work
# (remixes, live recordings, demos, reprises, copies, alternate versions, and edits)
def cleaning(df):
    # Title keywords that mark a song as a non-original version
    keywords = ['remix', 'Remix', 'live', 'Live', 'Version', 'version',
                'Demo', 'Reprise', 'COPY', 'Mix', 'edit', 'Edit']
    df1 = df.copy()
    # Drop any row whose title contains one of the keywords above
    non_original = df1['Title'].astype(str).apply(lambda t: any(k in t for k in keywords))
    df1 = df1[~non_original]
    # Drop rows with missing titles or lyrics, then drop duplicated songs
    df1.dropna(subset=["Title"], inplace=True)
    df1.dropna(subset=["Lyric"], inplace=True)
    df1.drop_duplicates(subset="Title", keep=False, inplace=True)
    df1.drop_duplicates(subset="Lyric", keep=False, inplace=True)
    return df1
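# Minimal check of cleaning() on a toy frame (hypothetical titles and lyrics);
# only the original 'Song A' row should survive
_toy = pd.DataFrame({'Title': ['Song A', 'Song A (Remix)'],
                     'Lyric': ['la la la', 'la la la remix']})
print(cleaning(_toy))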
# Convert lyrics into a cleaned word string that is easier to process during
# analysis: lower-case everything so matching is not case-sensitive, remove
# stop words to de-clutter the dataframe, and lemmatize so that only base
# forms of words remain
def lyrics_to_words(document):
if not isinstance(document, str):
return "" # Handle non-string input
stop_words = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
stopwordremoval = " ".join([i for i in document.lower().split() if i not in stop_words])
punctuationremoval = ''.join(ch for ch in stopwordremoval if ch not in exclude)
normalized = " ".join(lemma.lemmatize(word) for word in punctuationremoval.split())
return normalized
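# A small illustration of the preprocessing above (hypothetical input line;
# the exact output depends on the installed NLTK stopword list and WordNet data)
print(lyrics_to_words("The dogs are running through the streets"))
# stop words like "the" and "are" are removed, and plurals are lemmatized,
# e.g. "dogs" -> "dog", "streets" -> "street"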
# Extract the unique words from each song's lyrics
def uniqueWords(df):
def unique(list1):
return list(set(list1)) # Simplified using set
words = []
for word in df['Lyric']:
processed_lyric = lyrics_to_words(word)
words.append(unique(processed_lyric.split()))
df['words'] = words
return df
# Main pipeline
# Build the list of cleaned dataframes
df_allclean = []
for df in df_all:
df_cleaned = cleaning(df)
df_word_unique = uniqueWords(df_cleaned)
df_allclean.append(df_word_unique)
# df_main is the combined dataframe built from the cleaned frames (df_allclean)
df_main = pd.concat(df_allclean, ignore_index=True)
df_main.drop_duplicates(subset=['Title', 'Lyric'], inplace=True)
# Remove any song released outside the span 1900-2024
df_main['Year'] = pd.to_numeric(df_main['Year'], errors='coerce') # Ensure all values are numeric
df_main['Year'] = df_main['Year'].fillna(0) # Replace NaN with 0
df_main['Year'] = df_main['Year'].astype(int) # Convert to integer
df_main = df_main[(df_main['Year'] >= 1900) & (df_main['Year'] <= 2024)]
# Remove the leftover index columns ('Unnamed: ...') from the CSVs, as they are not needed
for col in ['Unnamed: 0', 'Unnamed: 0.1']:
if col in df_main.columns:
df_main = df_main.drop(col, axis=1)
df_main=df_main.reset_index(drop=True)
# Define a function to collapse contractions
# Here, a word like "i'm" is collapsed to "im" rather than expanded to "i am";
# this keeps the word count per lyric down while staying close to the
# original form of the song
def collapse_contractions(text):
    contractions = {
        "i'm": "im", "i've": "ive", "i'll": "ill", "i'd": "id", "you're": "youre", "you've": "youve", "you'll": "youll",
        "he's": "hes", "she's": "shes", "it's": "its", "they're": "theyre", "they've": "theyve", "we're": "were",
        "we've": "weve", "we'll": "well", "they'll": "theyll", "isn't": "isnt", "aren't": "arent", "wasn't": "wasnt",
        "weren't": "werent", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "won't": "wont", "wouldn't": "wouldnt",
        "shouldn't": "shouldnt", "can't": "cant", "couldn't": "couldnt", "don't": "dont", "doesn't": "doesnt", "didn't": "didnt",
        "cardi b": "cardib"  # artist-name normalization, not a contraction
    }
    # Replace each contraction with its collapsed form
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    collapsed_text = pattern.sub(lambda x: contractions[x.group()], text)
    # Remove apostrophes from any remaining instances
    collapsed_text = re.sub(r"'", '', collapsed_text)
    return collapsed_text
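# Example of the collapsing behaviour described above (hypothetical line):
# "i'm" is collapsed via the lookup table, and the stray apostrophe in
# "rollin'" is stripped by the final substitution
print(collapse_contractions("i'm rollin' tonight"))  # -> "im rollin tonight"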
# Function to count the words in each song's lyrics for analysis
def countword(df):
    all_words = []
    for lyrics in df['Lyric']:
        collapsed_lyrics = collapse_contractions(lyrics.lower())  # Collapse contractions first
        words = wordpunct_tokenize(collapsed_lyrics)  # Tokenize
        all_words.extend(words)
    return all_words
# function to analyze song lyrics sentiment value using SenticNet
def analyze_sentiment(word_list):
    # Reuse the module-level SenticNet instance (sn) initialized above
# Dictionary to store polarity and primary emotion for each word
sentiment_data = {}
# Variable to count words not found in SenticNet
count_of_word_not_in_senticNet = 0
# Loop over each word in the list
for word in word_list:
try:
# Attempt to get SenticNet data for the word
sn_data = sn.concept(word)
mood_tags = sn.moodtags(word)
first_mood_tag = mood_tags[0]
second_mood_tag = mood_tags[1]
if sn_data:
# Extract polarity and primary mood from SenticNet data
polarity = sn_data.get('polarity_value', 'N/A')
# Store the sentiment data in the dictionary
sentiment_data[word] = {
'polarity': polarity,
'primary_mood': first_mood_tag,
'secondary_mood': second_mood_tag
}
        except Exception:
            # The word is not in SenticNet; count it as a miss
            count_of_word_not_in_senticNet += 1
return sentiment_data, count_of_word_not_in_senticNet
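# Quick demonstration of analyze_sentiment() on a hand-picked word list
# (hypothetical words; whether each one is found depends on the installed
# senticnet package version)
demo_sentiment, demo_missing = analyze_sentiment(['love', 'hate', 'zzzqx'])
print(f"Sentiment data: {demo_sentiment}")
print(f"Words not found in SenticNet: {demo_missing}")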
# fixed_moods based on all eight emotions in SenticNet 6
fixed_moods = {
"#eagerness": 0,
"#calmness": 0,
"#joy": 0,
"#pleasantess": 0,
"#disgust": 0,
"#sadness": 0,
"#anger": 0,
"#fear": 0
}
def songSentiment_analyzer_by_word_primary_emotion(artist):
    # Filter the songs by artist
    Artist_df = df_main[df_main.Artist == artist]
    # Get the total number of songs
    total_songs = Artist_df.shape[0]
    #print(f"Total number of songs: {total_songs}")
song_mood_category = fixed_moods.copy()
number_of_word_not_in_artist_lyric = 0
for song in range(total_songs):
row_ = Artist_df.iloc[song]
text = row_.words
sentiment_analysis, count = analyze_sentiment(text)
mood_count = fixed_moods.copy()
for word, data in sentiment_analysis.items():
primary_emotion = data['primary_mood']
if primary_emotion in mood_count:
mood_count[primary_emotion] += 1
#print(f"Word: {word} - Polarity: {data['polarity']} - Primary Emotion: {data['primary_mood']}")
        # find the most frequent mood among all the words in a song
highest_mood = max(mood_count, key=mood_count.get)
if highest_mood in song_mood_category:
song_mood_category[highest_mood] += 1
number_of_word_not_in_artist_lyric += count
#print(song_mood_category)
#print(f"Words not found in SenticNet for all song by {artist}: {number_of_word_not_in_artist_lyric}")
#print("")
return total_songs, song_mood_category, number_of_word_not_in_artist_lyric
def songSentiment_analyzer_by_polarity_averaging(artist):
    # Filter the songs by artist
    Artist_df = df_main[df_main.Artist == artist]
    # Get the total number of songs
    total_songs = Artist_df.shape[0]
    avg_polarity_list_for_artist = []
    for song in range(total_songs):
        row_ = Artist_df.iloc[song]
        text = row_.words
        sentiment_analysis, count = analyze_sentiment(text)
        # Collect the polarity values for this song only
        # (reset per song so one song's words do not leak into another's average)
        polarity_list_for_avgSONG = []
        for word, data in sentiment_analysis.items():
            # SenticNet polarity values are returned as strings
            polarity_list_for_avgSONG.append(float(data['polarity']))
        # Average polarity of the song (skip songs with no SenticNet words)
        if polarity_list_for_avgSONG:
            avg_polarity_list_for_artist.append(np.mean(polarity_list_for_avgSONG))
    # The average of the per-song averages gives the artist-level score
    avg_polarity_for_artist = np.mean(avg_polarity_list_for_artist)
    return avg_polarity_for_artist
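# VADER is initialized near the top of the script but never used in the
# SenticNet pipeline. As a rough cross-check, one could score the raw text of
# a single lyric (a sketch; the 'compound' score ranges from -1 to 1)
if len(df_main) > 0:
    sample_lyric = df_main['Lyric'].iloc[0]
    print(f"VADER scores for one sample lyric: {vader.polarity_scores(sample_lyric)}")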
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']
# Initialize a list to hold results
senticnet_rows = []
for artist in artists:
# Perform sentiment analysis for the current artist
total_songs, song_mood_category, number_of_word_not_in_artist_lyric = songSentiment_analyzer_by_word_primary_emotion(artist)
# Create a dictionary with the artist's data
artist_data = {
"Artist": artist,
"Total Songs": total_songs,
"Words Not Found": number_of_word_not_in_artist_lyric
}
# Add mood categories to the dictionary
artist_data.update(song_mood_category)
# Append the artist's data to the list
    senticnet_rows.append(artist_data)
# Create a DataFrame from the list of dictionaries
df_senticnet = pd.DataFrame(senticnet_rows)
# Display the DataFrame
print(df_senticnet)
# Initialize a list to hold results
senticnet_polarity_rows = []
for artist in artists:
# Perform sentiment analysis for the current artist
avg_polarity_for_artist = songSentiment_analyzer_by_polarity_averaging(artist)
# Create a dictionary with the artist's data
artist_data = {
"Artist": artist,
"Average Polarity Score": avg_polarity_for_artist
}
# Append the artist's data to the list
    senticnet_polarity_rows.append(artist_data)
# Create a DataFrame from the list of dictionaries
df_senticnet_polarity = pd.DataFrame(senticnet_polarity_rows)
# Display the DataFrame
print(df_senticnet_polarity)
# Visualization using Matplotlib
plt.figure(figsize=(12, 6))
plt.bar(df_senticnet_polarity["Artist"], df_senticnet_polarity["Average Polarity Score"], color='skyblue')
plt.xticks(rotation=45, ha='right') # Rotate artist names for better readability
plt.xlabel("Artists")
plt.ylabel("Average Polarity Score")
plt.title("Average Polarity Score by Artist")
plt.tight_layout()
# Show the plot
plt.show()
##############################################################################
# DATA STATISTICS #
##############################################################################
# 1) BASIC DESCRIPTIVE STATISTICS FOR THE ENTIRE DATASET
print("========================================")
print(" BASIC DATA STATISTICS ")
print("========================================")
print(f"Total number of songs in df_main: {len(df_main)}")
print(f"Number of unique artists: {df_main['Artist'].nunique()}")
print(f"Year range: {df_main['Year'].min()} to {df_main['Year'].max()}")
print("========================================\n")
# 2) AVERAGE WORD COUNT (LYRIC LENGTH)
# First, compute lyric length as number of tokens in each song lyric
df_main['lyric_length'] = df_main['Lyric'].apply(
lambda x: len(x.split()) if isinstance(x, str) else 0
)
avg_lyric_length_all = df_main['lyric_length'].mean()
median_lyric_length_all = df_main['lyric_length'].median()
std_lyric_length_all = df_main['lyric_length'].std()
print("========================================")
print(" LYRIC LENGTH STATISTICS (ALL SONGS) ")
print("========================================")
print(f"Average lyric length (words): {avg_lyric_length_all:.2f}")
print(f"Median lyric length (words): {median_lyric_length_all:.2f}")
print(f"Standard Deviation: {std_lyric_length_all:.2f}")
print("========================================\n")
# 3) GROUP-BY ARTIST STATISTICS
artist_stats = df_main.groupby('Artist').agg({
'Year': ['mean', 'min', 'max'],
'lyric_length': ['mean', 'min', 'max']
})
# Rename columns for clarity
artist_stats.columns = [
'Avg_Year', 'Min_Year', 'Max_Year',
'Avg_LyricLength', 'Min_LyricLength', 'Max_LyricLength'
]
print("========================================")
print(" STATISTICS GROUPED BY ARTIST ")
print("========================================")
print(artist_stats)
print("========================================\n")
# 4) CORRELATION ANALYSIS (OPTIONAL)
# For example, we can look at whether newer songs tend to have more words.
# This is just one possible correlation. You can do many more!
corr_year_lyriclength = df_main[['Year', 'lyric_length']].corr().iloc[0, 1]
print("========================================")
print(" CORRELATION ANALYSIS ")
print("========================================")
print(f"Correlation between Year and Lyric Length: {corr_year_lyriclength:.4f}")
print("========================================\n")
# 5) ADDITIONAL VISUALIZATIONS (OPTIONAL)
# Example: Plot of Year vs. Average lyric length per year
# Just to demonstrate how you might visualize the correlation.
year_grouped = df_main.groupby('Year')['lyric_length'].mean().reset_index()
plt.figure(figsize=(10, 5))
plt.plot(year_grouped['Year'], year_grouped['lyric_length'], marker='o', linestyle='-', color='purple')
plt.title("Average Lyric Length Over the Years")
plt.xlabel("Year")
plt.ylabel("Average Lyric Length (words)")
plt.grid(True)
plt.show()