Sentiment Analysis Pipeline
Created on: 11 January 2025
#NLTK library for natural language processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
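# First-time NLTK setup (uncomment on the first run if these resources are missing):
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')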
#Dataset management/data handling
import pandas as pd
import numpy as np #for calculation
import string
import re
#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sn = SenticNet()
vader = SentimentIntensityAnalyzer()
import os #for building the CSV file paths
######### Change based on your path here ##########
base_dir = r'C:\Users\user\projects\Python-VS\Sentiment_Analysis\Song_csv'
artists = [
'ArianaGrande', 'Beyonce', 'BillieEilish', 'CardiB', 'CharliePuth',
'ColdPlay', 'Drake', 'DuaLipa', 'EdSheeran', 'Eminem', 'JustinBieber',
'KatyPerry', 'Khalid', 'LadyGaga', 'Maroon5', 'NickiMinaj',
'PostMalone', 'Rihanna', 'SelenaGomez', 'TaylorSwift'
]
file_paths = [os.path.join(base_dir, f"{artist}.csv") for artist in artists]
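# Optional sanity check (not part of the original flow): since base_dir is
# machine-specific, failing early on missing CSVs gives a clearer error than
# a pd.read_csv traceback. Uncomment to enable:
# missing = [p for p in file_paths if not os.path.exists(p)]
# if missing:
#     raise FileNotFoundError(f"CSV files not found: {missing}")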
for file in file_paths:
print(f"File path: {file}")
df_all = [pd.read_csv(file) for file in file_paths]
####### Data preparation ###########
def cleaning(df):
    # Drop alternate versions (remix, live, demo, etc.), then missing values and duplicates
    keywords = ['remix', 'Remix', 'live', 'Live', 'Version', 'version',
                'Demo', 'Reprise', 'COPY', 'Mix', 'edit', 'Edit']
    df1 = df.copy()
    # Titles containing any of the keywords are flagged for removal
    flagged = [t for t in df1['Title'] if isinstance(t, str) and any(k in t for k in keywords)]
    df1 = df1[~df1['Title'].isin(flagged)]
    # Remove rows with missing or duplicated titles/lyrics
    df1.dropna(subset=["Title"], inplace=True)
    df1.dropna(subset=["Lyric"], inplace=True)
    df1.drop_duplicates(subset="Title", keep=False, inplace=True)
    df1.drop_duplicates(subset="Lyric", keep=False, inplace=True)
    return df1
def lyrics_to_words(document):
    # Lowercase, remove stopwords and punctuation, and lemmatize a lyric
    if not isinstance(document, str):
        return ""  # Handle non-string input
    stop_words = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    stopwordremoval = " ".join([i for i in document.lower().split() if i not in stop_words])
    punctuationremoval = ''.join(ch for ch in stopwordremoval if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punctuationremoval.split())
    return normalized
def uniqueWords(df):
    # Store the list of unique (normalized) words of each lyric in df['words']
    words = []
    for lyric in df['Lyric']:
        processed_lyric = lyrics_to_words(lyric)
        words.append(list(set(processed_lyric.split())))  # Deduplicate via set
    df['words'] = words
    return df
df_allclean = []
for df in df_all:
df_cleaned = cleaning(df)
df_word_unique = uniqueWords(df_cleaned)
df_allclean.append(df_word_unique)
frames = df_allclean
df_main = pd.concat(frames,ignore_index=True)
df_main.drop_duplicates(subset=['Title', 'Lyric'], inplace=True)
df_main['Year'] = pd.to_numeric(df_main['Year'], errors='coerce') # Ensure all values are numeric
df_main['Year'] = df_main['Year'].fillna(0) # Replace NaN with 0
df_main['Year'] = df_main['Year'].astype(int) # Convert to integer
df_main = df_main[(df_main['Year'] >= 1900) & (df_main['Year'] <= 2024)]
for col in ['Unnamed: 0', 'Unnamed: 0.1']:
if col in df_main.columns:
df_main = df_main.drop(col, axis=1)
df_main=df_main.reset_index(drop=True)
def expand_contractions(text):
contractions = {
"i'm": "im", "i've": "ive", "i'll": "ill", "i'd": "id", "you're": "youre", "you've": "youve", "you'll": "youll",
"he's": "hes", "she's": "shes", "it's": "its", "they're": "theyre", "they've": "theyve", "we're": "were",
"we've": "weve", "we'll": "well", "they'll": "theyll", "isn't": "isnt", "aren't": "arent", "wasn't": "wasnt",
"weren't": "werent", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "won't": "wont", "wouldn't": "wouldnt",
"shouldn't": "shouldnt", "can't": "cant", "couldn't": "couldnt", "don't": "dont", "doesn't": "doesnt", "didn't": "didnt",
"cardi b": "cardib"
}
    # Map contractions to their apostrophe-free forms (note: despite the name,
    # this collapses contractions rather than expanding them)
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    expanded_text = pattern.sub(lambda x: contractions[x.group()], text)
    # Remove all remaining apostrophes
    expanded_text = re.sub(r"'", '', expanded_text)
    return expanded_text
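# Example of what expand_contractions actually produces (illustrative only):
# expand_contractions("i'm sure you're fine, don't worry")
# -> 'im sure youre fine, dont worry'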
#Function to count the words of each song's lyric for analysis
def countword(df):
all_words = []
for lyrics in df['Lyric']:
expanded_lyrics = expand_contractions(lyrics.lower()) # Expand contractions first
words = wordpunct_tokenize(expanded_lyrics) # Tokenize
all_words.extend(words)
return all_words
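# Illustrative usage with a tiny, hypothetical frame:
# demo = pd.DataFrame({'Lyric': ["I'm here", "Don't go"]})
# countword(demo)  # -> ['im', 'here', 'dont', 'go']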
'''
########### Basic Analysis / Comparison cleaned dataframe with original #############
before = []
after = []
length = []
ulength = []
wd = []
for df in df_all:
a, b = df.shape # Get the number of rows (a) and columns (b) before cleaning
before.append(a)
for dfc in df_allclean:
a, b = dfc.shape # Get the number of rows (a) and columns (b) after cleaning
after.append(a)
c = countword(dfc)
l = len(c)
ul = len(np.unique(c)) # Unique words
wd.append(c)
length.append(l)
ulength.append(ul)
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']
df_info =pd.DataFrame({'name':artists,'before':before,'after':after,'words':wd,'unique words':ulength,'word count':length})
df_info['diff']=df_info['before']-df_info['after']
df_info['words per songs'] = round(df_info['word count'] / df_info['after'],0)
df_info['words per songs'] = df_info['words per songs'].astype('int')
df_info['lexicalrichness']=(df_info['unique words']/df_info['word count'])*100
df_info=df_info[['name','before','after','diff','words','words per songs','unique words','word count','lexicalrichness']]
print(df_info)
#Visualization of this analysis
fig = go.Figure(data=[
go.Bar(name='Unique Word Count', x=df_info['name'], y=df_info['unique words'].tolist()),
go.Bar(name='Total Word Count', x=df_info['name'], y=df_info['word count'].tolist()),
])
fig.update_layout(barmode='group',title={'text': "Total words vs Unique words",'y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()
fig = px.bar(df_info, x='name',y='lexicalrichness')
fig.update_layout(title={'text': "Lexicalrichness of all artist",'y':1,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()
df_group = df_main.groupby(['Artist', 'Year']).agg({'Title': 'count', 'Lyric': 'count'}).reset_index()
df_group.rename(columns={'Title': 'Title_Count', 'Lyric': 'Song_Count'}, inplace=True)
df_temp = df_group.groupby(['Year']).agg({'Title_Count': 'sum'}).reset_index()
fig = px.line(df_temp, x='Year', y='Title_Count',
title="Number of Titles by All Artists from 2001-2024",
labels={'Year': 'Year', 'Title_Count': 'Number of Titles'},
markers=True)
fig.update_layout(
title={
'text': "Number of Titles by All Artists from 1900-2024",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'
},
xaxis_title="Year",
yaxis_title="Number of Titles"
)
fig.show()
########## Sentiment Analysis ###########
def sentiment_analyzer(df):
    # Negative scores indicate negative emotions (e.g., sadness, anger, sarcasm)
    # Scores near 0 indicate neutral emotion
    # Positive scores indicate positive emotions (e.g., happiness, pride)
neg='Negative'
neu='Neutral'
pos='Positive'
negative = []
neutral = []
positive = []
dominant_sentiment=[]
dominant_sentiment_score=[]
#Initialize the model
sid = SentimentIntensityAnalyzer()
#Iterate for each row of lyrics and append the scores
    for i in df.index:
        scores = sid.polarity_scores(df['Lyric'].iloc[i])
        negative.append(scores['neg'])
        neutral.append(scores['neu'])
        positive.append(scores['pos'])
        if scores['neg'] > scores['pos']:
            dominant_sentiment_score.append(scores['neg'])
            dominant_sentiment.append(neg)
        elif scores['neg'] < scores['pos']:
            dominant_sentiment_score.append(scores['pos'])
            dominant_sentiment.append(pos)
        else:
            dominant_sentiment_score.append(scores['neu'])
            dominant_sentiment.append(neu)
    # Create five columns in the main data frame, one per score
    df['negative'] = negative
    df['neutral'] = neutral
    df['positive'] = positive
    df['dominant_sentiment'] = dominant_sentiment
    df['dominant_sentiment_score'] = dominant_sentiment_score
    return df
df_sentiment = sentiment_analyzer(df_main)
print(df_sentiment)
#df_sentiment.to_csv("VADER_Sentiment_Analyzer.csv", index=False)
#Visualization of the VADER sentiment analysis results
df_temp = []
artists = df_sentiment['Artist'].unique()
if len(artists) < 20:
raise ValueError("Not enough unique artists for the plots.")
for artist in artists:
df_temp.append(df_sentiment[df_sentiment['Artist'] == artist])
custom_palette = {
'Positive': 'green',
'Negative': 'red',
'Neutral': 'blue'
}
category_order = ['Positive', 'Negative', 'Neutral']
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 15), constrained_layout=True)
for i in range(9):
row = i // 3
col = i % 3
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[row][col], palette=custom_palette, order=category_order)
axs[row][col].set_title(artists[i])
axs[row][col].set_xlabel("Dominant Sentiment")
axs[row][col].set_ylabel("Sentiment Score")
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 15), constrained_layout=True)
for i in range(9, 18):
row = (i - 9) // 3
col = (i - 9) % 3
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[row][col], palette=custom_palette, order=category_order)
axs[row][col].set_title(artists[i])
axs[row][col].set_xlabel("Dominant Sentiment")
axs[row][col].set_ylabel("Sentiment Score")
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), constrained_layout=True)
for i in range(18, 20):
col = i - 18
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[col], palette=custom_palette, order=category_order)
axs[col].set_title(artists[i])
axs[col].set_xlabel("Dominant Sentiment")
axs[col].set_ylabel("Sentiment Score")
axs[2].set_visible(False)  # Only two artists remain, so hide the unused third subplot
plt.show()
'''
def analyze_sentiment(word_list):
# Initialize SenticNet
sn = SenticNet()
    # Dictionary to store polarity and mood tags for each word
    sentiment_data = {}
    # Count the words not found in SenticNet
    count_of_word_not_in_senticNet = 0
    # Loop over each word in the list
    for word in word_list:
        try:
            # Attempt to get SenticNet data for the word
            sn_data = sn.concept(word)
            mood_tags = sn.moodtags(word)
            first_mood_tag = mood_tags[0]
            second_mood_tag = mood_tags[1]
            if sn_data:
                # Extract polarity from the SenticNet data
                polarity = sn_data.get('polarity_value', 'N/A')
                # Store the sentiment data in the dictionary
                sentiment_data[word] = {
                    'polarity': polarity,
                    'primary_mood': first_mood_tag,
                    'secondary_mood': second_mood_tag
                }
        except Exception:
            # The word is not in SenticNet (or the lookup failed)
            count_of_word_not_in_senticNet += 1
    return sentiment_data, count_of_word_not_in_senticNet
fixed_moods = {
"#eagerness": 0,
"#calmness": 0,
"#joy": 0,
"#pleasantess": 0,
"#disgust": 0,
"#sadness": 0,
"#anger": 0,
"#fear": 0
}
def songSentiment_analyzer_by_word_primary_emotion(artist):
# filter the song by Artist
Artist_df = df_main[df_main.Artist == artist]
    # Total number of songs for this artist
    total_songs = Artist_df.shape[0]
    song_mood_category = fixed_moods.copy()
    number_of_word_not_in_artist_lyric = 0
    for song in range(total_songs):
        row_ = Artist_df.iloc[song]
        text = row_.words
        sentiment_analysis, count = analyze_sentiment(text)
        mood_count = fixed_moods.copy()
        for word, data in sentiment_analysis.items():
            primary_emotion = data['primary_mood']
            if primary_emotion in mood_count:
                mood_count[primary_emotion] += 1
        # The most frequent primary mood among the song's words labels the song
        highest_mood = max(mood_count, key=mood_count.get)
        if highest_mood in song_mood_category:
            song_mood_category[highest_mood] += 1
        number_of_word_not_in_artist_lyric += count
    return total_songs, song_mood_category, number_of_word_not_in_artist_lyric
def songSentiment_analyzer_by_polarity_averaging(artist):
    # filter the songs by Artist
    Artist_df = df_main[df_main.Artist == artist]
    total_songs = Artist_df.shape[0]
    avg_polarity_list_for_artist = []
    for song in range(total_songs):
        row_ = Artist_df.iloc[song]
        text = row_.words
        sentiment_analysis, count = analyze_sentiment(text)
        polarity_list_for_avgSONG = []  # Per-song polarity values
        for word, data in sentiment_analysis.items():
            # SenticNet polarity values are strings, so convert first
            polarity_list_for_avgSONG.append(float(data['polarity']))
        if polarity_list_for_avgSONG:  # Skip songs with no SenticNet matches
            avg_polarity_list_for_artist.append(np.mean(polarity_list_for_avgSONG))
    # Mean of the per-song averages across the artist's discography
    return np.mean(avg_polarity_list_for_artist) if avg_polarity_list_for_artist else 0.0
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']
senticnet_rows = []
for artist in artists:
    # Perform sentiment analysis for the current artist
    total_songs, song_mood_category, number_of_word_not_in_artist_lyric = songSentiment_analyzer_by_word_primary_emotion(artist)
    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Total Songs": total_songs,
        "Words Not Found": number_of_word_not_in_artist_lyric
    }
    # Add the mood category counts
    artist_data.update(song_mood_category)
    # Append the artist's data to the list
    senticnet_rows.append(artist_data)
df_senticnet = pd.DataFrame(senticnet_rows)
print(df_senticnet)
'''
mood_colors = {
'#eagerness': '#e66f6f', # Slightly darker red
'#calmness': '#3380cc', # Slightly darker blue
'#joy': '#66cc66', # Slightly darker green
'#pleasantess': '#e6b36e', # Slightly darker orange
'#disgust': '#8f8fdb', # Slightly darker lavender
'#sadness': '#e680b3', # Slightly darker pink
'#anger': '#80cc80', # Slightly darker light green
'#fear': '#e64d4d', # Slightly darker red
'Other': '#b3b3b3' # Slightly darker gray
}
for index, row in df_senticnet.iterrows():
# Extract the mood distribution for the artist
mood_data = row[['#eagerness', '#calmness', '#joy', '#pleasantess', '#disgust', '#sadness', '#anger', '#fear']]
    # Total of the mood counts
    total = mood_data.sum()
    # Convert the counts to percentages
    percentages = (mood_data / total) * 100
    # Group values below 2% into an "Other" category
    filtered_data = mood_data[percentages >= 2]
    other_value = mood_data[percentages < 2].sum()
    if other_value > 0:
        filtered_data['Other'] = other_value
    # Sort to keep a stable slice order in the pie chart
    filtered_data = filtered_data.sort_index()
    # Colors for the filtered slices
    colors = [mood_colors[mood] for mood in filtered_data.index]
    # Plot the pie chart
    plt.figure(figsize=(6, 6))
    plt.pie(
        filtered_data,
        labels=filtered_data.index,
        autopct=lambda p: f'{p:.1f}%' if p > 2 else '',
        startangle=140,
        colors=colors
    )
    plt.title(f"Mood Distribution for {row['Artist']}")
    plt.show()
'''
df_senticnet_polarity = []
for artist in artists:
    # Average SenticNet polarity for the current artist
    avg_polarity_for_artist = songSentiment_analyzer_by_polarity_averaging(artist)
    df_senticnet_polarity.append({
        "Artist": artist,
        "Average Polarity Score": avg_polarity_for_artist
    })
df_senticnet_polarity = pd.DataFrame(df_senticnet_polarity)
print(df_senticnet_polarity)
plt.figure(figsize=(12, 6))
plt.bar(df_senticnet_polarity["Artist"], df_senticnet_polarity["Average Polarity Score"], color='skyblue')
plt.xticks(rotation=45, ha='right') # Rotate artist names for better readability
plt.xlabel("Artists")
plt.ylabel("Average Polarity Score")
plt.title("Average Polarity Score by Artist")
plt.tight_layout()
plt.show()
Can you identify the flow of this program, starting from data collection etc.? Let's say there is no data collection, then there is no need to mention it.
Below is an overview of the key steps in the program’s flow, from loading CSV files through cleaning and analyzing the data with both VADER and SenticNet, and then visualizing the results. (Since the question says “there is no data collection,” we’ll assume that the CSVs already exist, so we begin at the stage of reading them.)
1. Setup and data loading
- Imports `os`, `re`, `string`, and `warnings` for file handling, regular expressions, punctuation removal, and so on.
- Defines `base_dir` (the directory where the CSV files are located).
- Builds the list of CSV paths (`file_paths`) for all artists.
- Reads each CSV into a list of DataFrames (`df_all`).
2. Data cleaning and preprocessing
- The `cleaning(df)` function checks each song's title for keywords like "remix," "live," "version," etc., and drops those rows along with missing values and duplicates.
- The `lyrics_to_words(document)` function lowercases lyrics, removes stopwords and punctuation, and applies lemmatization.
- The `uniqueWords(df)` function extracts the unique words of each lyric, storing them in a `df['words']` column.
- Each DataFrame in `df_all` is cleaned and processed into `df_allclean`, and everything is then concatenated into a single `df_main`.
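As a quick illustration of what this preprocessing produces (the exact output depends on NLTK's stopword list and lemmatizer):

```python
print(lyrics_to_words("I was dancing in the flames"))
# -> "dancing flame": stopwords dropped, punctuation stripped, plurals lemmatized
```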
3. Exploratory analysis (currently commented out)
- This portion of the code compares row counts before and after cleaning, computes word counts and lexical richness per artist, groups by artist and year, and builds bar and line plots for the number of songs, unique words, etc.
4. VADER sentiment analysis (also inside the commented-out block)
- The `sentiment_analyzer(df)` function iterates through each lyric and uses VADER to get sentiment scores (negative, neutral, positive).
- It adds the following columns to `df`: `negative`, `neutral`, `positive`, `dominant_sentiment`, and `dominant_sentiment_score`.
- Seaborn swarm plots show the distribution of VADER sentiment scores per artist.
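Under the hood, VADER returns four scores per text; the pipeline keeps `neg`, `neu`, and `pos` and derives the dominant label from them. A minimal illustration (the numeric values shown are placeholders):

```python
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("I love this song")
print(scores)  # dict with keys 'neg', 'neu', 'pos', 'compound', e.g. {'neg': 0.0, ...}
```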
5. SenticNet sentiment analysis
- `analyze_sentiment(word_list)` checks each word against SenticNet to retrieve its polarity value and its primary and secondary mood tags, counting the words SenticNet does not know.
- `songSentiment_analyzer_by_word_primary_emotion(artist)` tallies the dominant mood for each song (the most frequent mood among all words in that song).
- `songSentiment_analyzer_by_polarity_averaging(artist)` calculates the average polarity for each song, then the mean across the artist's entire discography; `df_senticnet_polarity` stores these averages per artist.
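The SenticNet lookups mirror the calls used in `analyze_sentiment` (a sketch; the available keys and mood-tag labels depend on the installed `senticnet` package version):

```python
from senticnet.senticnet import SenticNet

sn = SenticNet()
concept = sn.concept("love")   # dict of SenticNet attributes, including 'polarity_value'
moods = sn.moodtags("love")    # list of mood tags such as '#joy'
print(concept.get('polarity_value'), moods[:2])
```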
stores these average polarity scores for each artist.df_senticnet
, df_secticnet_polarity
) and generates visualizations (swarm plots, bar plots, etc.) comparing artists across different sentiment metrics.This end-to-end process transforms raw CSV lyrics data into cleaned, sentiment-annotated data and provides insights into overall mood and polarity trends across multiple artists.