Sentiment Analysis Pipeline
Created on: 11 January 2025
#NLTK library for natural language processing
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
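# First-time NLTK setup (uncomment on the first run if these resources are missing):
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('vader_lexicon')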
#Dataset management/data handling
import pandas as pd
import numpy as np #for calculation
import string
import re
#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sn = SenticNet()
vader = SentimentIntensityAnalyzer()
import os #for building the CSV file paths
######### Change based on your path here ##########
base_dir = r'C:\Users\user\projects\Python-VS\Sentiment_Analysis\Song_csv'
artists = [
'ArianaGrande', 'Beyonce', 'BillieEilish', 'CardiB', 'CharliePuth',
'ColdPlay', 'Drake', 'DuaLipa', 'EdSheeran', 'Eminem', 'JustinBieber',
'KatyPerry', 'Khalid', 'LadyGaga', 'Maroon5', 'NickiMinaj',
'PostMalone', 'Rihanna', 'SelenaGomez', 'TaylorSwift'
]
file_paths = [os.path.join(base_dir, f"{artist}.csv") for artist in artists]
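# Optional sanity check (not part of the original flow): since base_dir is
# machine-specific, failing early on missing CSVs gives a clearer error than
# a pd.read_csv traceback. Uncomment to enable:
# missing = [p for p in file_paths if not os.path.exists(p)]
# if missing:
#     raise FileNotFoundError(f"CSV files not found: {missing}")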
for file in file_paths:
print(f"File path: {file}")
df_all = [pd.read_csv(file) for file in file_paths]
####### Data preparation ###########
def cleaning(df):
    # Drop alternate versions (remix, live, demo, etc.), then missing values and duplicates
    keywords = ['remix', 'Remix', 'live', 'Live', 'Version', 'version',
                'Demo', 'Reprise', 'COPY', 'Mix', 'edit', 'Edit']
    df1 = df.copy()
    # Titles containing any of the keywords are flagged for removal
    flagged = [t for t in df1['Title'] if isinstance(t, str) and any(k in t for k in keywords)]
    df1 = df1[~df1['Title'].isin(flagged)]
    # Remove rows with missing or duplicated titles/lyrics
    df1.dropna(subset=["Title"], inplace=True)
    df1.dropna(subset=["Lyric"], inplace=True)
    df1.drop_duplicates(subset="Title", keep=False, inplace=True)
    df1.drop_duplicates(subset="Lyric", keep=False, inplace=True)
    return df1
def lyrics_to_words(document):
    # Lowercase, remove stopwords and punctuation, and lemmatize a lyric
    if not isinstance(document, str):
        return ""  # Handle non-string input
    stop_words = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    stopwordremoval = " ".join([i for i in document.lower().split() if i not in stop_words])
    punctuationremoval = ''.join(ch for ch in stopwordremoval if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punctuationremoval.split())
    return normalized
def uniqueWords(df):
    # Store the list of unique (normalized) words of each lyric in df['words']
    words = []
    for lyric in df['Lyric']:
        processed_lyric = lyrics_to_words(lyric)
        words.append(list(set(processed_lyric.split())))  # Deduplicate via set
    df['words'] = words
    return df
df_allclean = []
for df in df_all:
df_cleaned = cleaning(df)
df_word_unique = uniqueWords(df_cleaned)
df_allclean.append(df_word_unique)
frames = df_allclean
df_main = pd.concat(frames,ignore_index=True)
df_main.drop_duplicates(subset=['Title', 'Lyric'], inplace=True)
df_main['Year'] = pd.to_numeric(df_main['Year'], errors='coerce') # Ensure all values are numeric
df_main['Year'] = df_main['Year'].fillna(0) # Replace NaN with 0
df_main['Year'] = df_main['Year'].astype(int) # Convert to integer
df_main = df_main[(df_main['Year'] >= 1900) & (df_main['Year'] <= 2024)]
for col in ['Unnamed: 0', 'Unnamed: 0.1']:
if col in df_main.columns:
df_main = df_main.drop(col, axis=1)
df_main=df_main.reset_index(drop=True)
def expand_contractions(text):
contractions = {
"i'm": "im", "i've": "ive", "i'll": "ill", "i'd": "id", "you're": "youre", "you've": "youve", "you'll": "youll",
"he's": "hes", "she's": "shes", "it's": "its", "they're": "theyre", "they've": "theyve", "we're": "were",
"we've": "weve", "we'll": "well", "they'll": "theyll", "isn't": "isnt", "aren't": "arent", "wasn't": "wasnt",
"weren't": "werent", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "won't": "wont", "wouldn't": "wouldnt",
"shouldn't": "shouldnt", "can't": "cant", "couldn't": "couldnt", "don't": "dont", "doesn't": "doesnt", "didn't": "didnt",
"cardi b": "cardib"
}
    # Map contractions to their apostrophe-free forms (note: despite the name,
    # this collapses contractions rather than expanding them)
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    expanded_text = pattern.sub(lambda x: contractions[x.group()], text)
    # Remove all remaining apostrophes
    expanded_text = re.sub(r"'", '', expanded_text)
    return expanded_text
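# Example of what expand_contractions actually produces (illustrative only):
# expand_contractions("i'm sure you're fine, don't worry")
# -> 'im sure youre fine, dont worry'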
#Function to count the words of each song's lyric for analysis
def countword(df):
all_words = []
for lyrics in df['Lyric']:
expanded_lyrics = expand_contractions(lyrics.lower()) # Expand contractions first
words = wordpunct_tokenize(expanded_lyrics) # Tokenize
all_words.extend(words)
return all_words
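# Illustrative usage with a tiny, hypothetical frame:
# demo = pd.DataFrame({'Lyric': ["I'm here", "Don't go"]})
# countword(demo)  # -> ['im', 'here', 'dont', 'go']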
'''
########### Basic Analysis / Comparison cleaned dataframe with original #############
before = []
after = []
length = []
ulength = []
wd = []
for df in df_all:
a, b = df.shape # Get the number of rows (a) and columns (b) before cleaning
before.append(a)
for dfc in df_allclean:
a, b = dfc.shape # Get the number of rows (a) and columns (b) after cleaning
after.append(a)
c = countword(dfc)
l = len(c)
ul = len(np.unique(c)) # Unique words
wd.append(c)
length.append(l)
ulength.append(ul)
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']
df_info =pd.DataFrame({'name':artists,'before':before,'after':after,'words':wd,'unique words':ulength,'word count':length})
df_info['diff']=df_info['before']-df_info['after']
df_info['words per songs'] = round(df_info['word count'] / df_info['after'],0)
df_info['words per songs'] = df_info['words per songs'].astype('int')
df_info['lexicalrichness']=(df_info['unique words']/df_info['word count'])*100
df_info=df_info[['name','before','after','diff','words','words per songs','unique words','word count','lexicalrichness']]
print(df_info)
#Visualization of this analysis
fig = go.Figure(data=[
go.Bar(name='Unique Word Count', x=df_info['name'], y=df_info['unique words'].tolist()),
go.Bar(name='Total Word Count', x=df_info['name'], y=df_info['word count'].tolist()),
])
fig.update_layout(barmode='group',title={'text': "Total words vs Unique words",'y':0.9,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()
fig = px.bar(df_info, x='name',y='lexicalrichness')
fig.update_layout(title={'text': "Lexicalrichness of all artist",'y':1,'x':0.5,'xanchor': 'center','yanchor': 'top'})
fig.show()
df_group = df_main.groupby(['Artist', 'Year']).agg({'Title': 'count', 'Lyric': 'count'}).reset_index()
df_group.rename(columns={'Title': 'Title_Count', 'Lyric': 'Song_Count'}, inplace=True)
df_temp = df_group.groupby(['Year']).agg({'Title_Count': 'sum'}).reset_index()
fig = px.line(df_temp, x='Year', y='Title_Count',
title="Number of Titles by All Artists from 2001-2024",
labels={'Year': 'Year', 'Title_Count': 'Number of Titles'},
markers=True)
fig.update_layout(
title={
'text': "Number of Titles by All Artists from 1900-2024",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'
},
xaxis_title="Year",
yaxis_title="Number of Titles"
)
fig.show()
########## Sentiment Analysis ###########
def sentiment_analyzer(df):
    # Negative scores indicate negative emotions (e.g., sadness, anger, sarcasm)
    # Scores near 0 indicate neutral emotion
    # Positive scores indicate positive emotions (e.g., happiness, pride)
neg='Negative'
neu='Neutral'
pos='Positive'
negative = []
neutral = []
positive = []
dominant_sentiment=[]
dominant_sentiment_score=[]
#Initialize the model
sid = SentimentIntensityAnalyzer()
#Iterate for each row of lyrics and append the scores
    for i in df.index:
        scores = sid.polarity_scores(df['Lyric'].iloc[i])
        negative.append(scores['neg'])
        neutral.append(scores['neu'])
        positive.append(scores['pos'])
        if scores['neg'] > scores['pos']:
            dominant_sentiment_score.append(scores['neg'])
            dominant_sentiment.append(neg)
        elif scores['neg'] < scores['pos']:
            dominant_sentiment_score.append(scores['pos'])
            dominant_sentiment.append(pos)
        else:
            dominant_sentiment_score.append(scores['neu'])
            dominant_sentiment.append(neu)
    # Create five columns in the main data frame, one per score
    df['negative'] = negative
    df['neutral'] = neutral
    df['positive'] = positive
    df['dominant_sentiment'] = dominant_sentiment
    df['dominant_sentiment_score'] = dominant_sentiment_score
    return df
df_sentiment = sentiment_analyzer(df_main)
print(df_sentiment)
#df_sentiment.to_csv("VADER_Sentiment_Analyzer.csv", index=False)
#Visualization of the VADER sentiment analysis results
df_temp = []
artists = df_sentiment['Artist'].unique()
if len(artists) < 20:
raise ValueError("Not enough unique artists for the plots.")
for artist in artists:
df_temp.append(df_sentiment[df_sentiment['Artist'] == artist])
custom_palette = {
'Positive': 'green',
'Negative': 'red',
'Neutral': 'blue'
}
category_order = ['Positive', 'Negative', 'Neutral']
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 15), constrained_layout=True)
for i in range(9):
row = i // 3
col = i % 3
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[row][col], palette=custom_palette, order=category_order)
axs[row][col].set_title(artists[i])
axs[row][col].set_xlabel("Dominant Sentiment")
axs[row][col].set_ylabel("Sentiment Score")
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 15), constrained_layout=True)
for i in range(9, 18):
row = (i - 9) // 3
col = (i - 9) % 3
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[row][col], palette=custom_palette, order=category_order)
axs[row][col].set_title(artists[i])
axs[row][col].set_xlabel("Dominant Sentiment")
axs[row][col].set_ylabel("Sentiment Score")
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), constrained_layout=True)
for i in range(18, 20):
col = i - 18
sns.swarmplot(data=df_temp[i], x="dominant_sentiment", y="dominant_sentiment_score",
ax=axs[col], palette=custom_palette, order=category_order)
axs[col].set_title(artists[i])
axs[col].set_xlabel("Dominant Sentiment")
axs[col].set_ylabel("Sentiment Score")
axs[2].set_visible(False)  # Only two artists remain, so hide the unused third subplot
plt.show()
'''
def analyze_sentiment(word_list):
# Initialize SenticNet
sn = SenticNet()
    # Dictionary to store polarity and mood tags for each word
    sentiment_data = {}
    # Count the words not found in SenticNet
    count_of_word_not_in_senticNet = 0
    # Loop over each word in the list
    for word in word_list:
        try:
            # Attempt to get SenticNet data for the word
            sn_data = sn.concept(word)
            mood_tags = sn.moodtags(word)
            first_mood_tag = mood_tags[0]
            second_mood_tag = mood_tags[1]
            if sn_data:
                # Extract polarity from the SenticNet data
                polarity = sn_data.get('polarity_value', 'N/A')
                # Store the sentiment data in the dictionary
                sentiment_data[word] = {
                    'polarity': polarity,
                    'primary_mood': first_mood_tag,
                    'secondary_mood': second_mood_tag
                }
        except Exception:
            # The word is not in SenticNet (or the lookup failed)
            count_of_word_not_in_senticNet += 1
    return sentiment_data, count_of_word_not_in_senticNet
fixed_moods = {
"#eagerness": 0,
"#calmness": 0,
"#joy": 0,
"#pleasantess": 0,
"#disgust": 0,
"#sadness": 0,
"#anger": 0,
"#fear": 0
}
def songSentiment_analyzer_by_word_primary_emotion(artist):
# filter the song by Artist
Artist_df = df_main[df_main.Artist == artist]
    # Total number of songs for this artist
    total_songs = Artist_df.shape[0]
    song_mood_category = fixed_moods.copy()
    number_of_word_not_in_artist_lyric = 0
    for song in range(total_songs):
        row_ = Artist_df.iloc[song]
        text = row_.words
        sentiment_analysis, count = analyze_sentiment(text)
        mood_count = fixed_moods.copy()
        for word, data in sentiment_analysis.items():
            primary_emotion = data['primary_mood']
            if primary_emotion in mood_count:
                mood_count[primary_emotion] += 1
        # The most frequent primary mood among the song's words labels the song
        highest_mood = max(mood_count, key=mood_count.get)
        if highest_mood in song_mood_category:
            song_mood_category[highest_mood] += 1
        number_of_word_not_in_artist_lyric += count
    return total_songs, song_mood_category, number_of_word_not_in_artist_lyric
def songSentiment_analyzer_by_polarity_averaging(artist):
    # filter the songs by Artist
    Artist_df = df_main[df_main.Artist == artist]
    total_songs = Artist_df.shape[0]
    avg_polarity_list_for_artist = []
    for song in range(total_songs):
        row_ = Artist_df.iloc[song]
        text = row_.words
        sentiment_analysis, count = analyze_sentiment(text)
        polarity_list_for_avgSONG = []  # Per-song polarity values
        for word, data in sentiment_analysis.items():
            # SenticNet polarity values are strings, so convert first
            polarity_list_for_avgSONG.append(float(data['polarity']))
        if polarity_list_for_avgSONG:  # Skip songs with no SenticNet matches
            avg_polarity_list_for_artist.append(np.mean(polarity_list_for_avgSONG))
    # Mean of the per-song averages across the artist's discography
    return np.mean(avg_polarity_list_for_artist) if avg_polarity_list_for_artist else 0.0
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']
senticnet_rows = []
for artist in artists:
    # Perform sentiment analysis for the current artist
    total_songs, song_mood_category, number_of_word_not_in_artist_lyric = songSentiment_analyzer_by_word_primary_emotion(artist)
    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Total Songs": total_songs,
        "Words Not Found": number_of_word_not_in_artist_lyric
    }
    # Add the mood category counts
    artist_data.update(song_mood_category)
    # Append the artist's data to the list
    senticnet_rows.append(artist_data)
df_senticnet = pd.DataFrame(senticnet_rows)
print(df_senticnet)
'''
mood_colors = {
'#eagerness': '#e66f6f', # Slightly darker red
'#calmness': '#3380cc', # Slightly darker blue
'#joy': '#66cc66', # Slightly darker green
'#pleasantess': '#e6b36e', # Slightly darker orange
'#disgust': '#8f8fdb', # Slightly darker lavender
'#sadness': '#e680b3', # Slightly darker pink
'#anger': '#80cc80', # Slightly darker light green
'#fear': '#e64d4d', # Slightly darker red
'Other': '#b3b3b3' # Slightly darker gray
}
for index, row in df_senticnet.iterrows():
# Extract the mood distribution for the artist
mood_data = row[['#eagerness', '#calmness', '#joy', '#pleasantess', '#disgust', '#sadness', '#anger', '#fear']]
    # Total of the mood counts
    total = mood_data.sum()
    # Convert the counts to percentages
    percentages = (mood_data / total) * 100
    # Group values below 2% into an "Other" category
    filtered_data = mood_data[percentages >= 2]
    other_value = mood_data[percentages < 2].sum()
    if other_value > 0:
        filtered_data['Other'] = other_value
    # Sort to keep a stable slice order in the pie chart
    filtered_data = filtered_data.sort_index()
    # Colors for the filtered slices
    colors = [mood_colors[mood] for mood in filtered_data.index]
    # Plot the pie chart
    plt.figure(figsize=(6, 6))
    plt.pie(
        filtered_data,
        labels=filtered_data.index,
        autopct=lambda p: f'{p:.1f}%' if p > 2 else '',
        startangle=140,
        colors=colors
    )
    plt.title(f"Mood Distribution for {row['Artist']}")
    plt.show()
'''
df_senticnet_polarity = []
for artist in artists:
    # Average SenticNet polarity for the current artist
    avg_polarity_for_artist = songSentiment_analyzer_by_polarity_averaging(artist)
    df_senticnet_polarity.append({
        "Artist": artist,
        "Average Polarity Score": avg_polarity_for_artist
    })
df_senticnet_polarity = pd.DataFrame(df_senticnet_polarity)
print(df_senticnet_polarity)
plt.figure(figsize=(12, 6))
plt.bar(df_senticnet_polarity["Artist"], df_senticnet_polarity["Average Polarity Score"], color='skyblue')
plt.xticks(rotation=45, ha='right') # Rotate artist names for better readability
plt.xlabel("Artists")
plt.ylabel("Average Polarity Score")
plt.title("Average Polarity Score by Artist")
plt.tight_layout()
plt.show()
Can you identify the flow of this program, starting from data collection etc.? Let's say there is no data collection, then there is no need to mention it.
Below is an overview of the key steps in the program’s flow, from loading CSV files through cleaning and analyzing the data with both VADER and SenticNet, and then visualizing the results. (Since the question says “there is no data collection,” we’ll assume that the CSVs already exist, so we begin at the stage of reading them.)
1. Setup and data loading
- Imports `os`, `re`, `string`, and `warnings` for file handling, regular expressions, punctuation removal, and so on.
- Defines `base_dir` (the directory where the CSV files are located).
- Builds the list of CSV paths (`file_paths`) for all artists.
- Reads each CSV into a list of DataFrames (`df_all`).
2. Data cleaning and preprocessing
- The `cleaning(df)` function checks each song's title for keywords like "remix," "live," "version," etc., and drops those rows along with missing values and duplicates.
- The `lyrics_to_words(document)` function lowercases lyrics, removes stopwords and punctuation, and applies lemmatization.
- The `uniqueWords(df)` function extracts the unique words of each lyric, storing them in a `df['words']` column.
- Each DataFrame in `df_all` is cleaned and processed into `df_allclean`, and everything is then concatenated into a single `df_main`.
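As a quick illustration of what this preprocessing produces (the exact output depends on NLTK's stopword list and lemmatizer):

```python
print(lyrics_to_words("I was dancing in the flames"))
# -> "dancing flame": stopwords dropped, punctuation stripped, plurals lemmatized
```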
3. Exploratory analysis (currently commented out)
- This portion of the code compares row counts before and after cleaning, computes word counts and lexical richness per artist, groups by artist and year, and builds bar and line plots for the number of songs, unique words, etc.
4. VADER sentiment analysis (also inside the commented-out block)
- The `sentiment_analyzer(df)` function iterates through each lyric and uses VADER to get sentiment scores (negative, neutral, positive).
- It adds the following columns to `df`: `negative`, `neutral`, `positive`, `dominant_sentiment`, and `dominant_sentiment_score`.
- Seaborn swarm plots show the distribution of VADER sentiment scores per artist.
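Under the hood, VADER returns four scores per text; the pipeline keeps `neg`, `neu`, and `pos` and derives the dominant label from them. A minimal illustration (the numeric values shown are placeholders):

```python
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("I love this song")
print(scores)  # dict with keys 'neg', 'neu', 'pos', 'compound', e.g. {'neg': 0.0, ...}
```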
5. SenticNet sentiment analysis
- `analyze_sentiment(word_list)` checks each word against SenticNet to retrieve its polarity value and its primary and secondary mood tags, counting the words SenticNet does not know.
- `songSentiment_analyzer_by_word_primary_emotion(artist)` tallies the dominant mood for each song (the most frequent mood among all words in that song).
- `songSentiment_analyzer_by_polarity_averaging(artist)` calculates the average polarity for each song, then the mean across the artist's entire discography; `df_senticnet_polarity` stores these averages per artist.
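The SenticNet lookups mirror the calls used in `analyze_sentiment` (a sketch; the available keys and mood-tag labels depend on the installed `senticnet` package version):

```python
from senticnet.senticnet import SenticNet

sn = SenticNet()
concept = sn.concept("love")   # dict of SenticNet attributes, including 'polarity_value'
moods = sn.moodtags("love")    # list of mood tags such as '#joy'
print(concept.get('polarity_value'), moods[:2])
```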
stores these average polarity scores for each artist.df_senticnet
, df_secticnet_polarity
) and generates visualizations (swarm plots, bar plots, etc.) comparing artists across different sentiment metrics.This end-to-end process transforms raw CSV lyrics data into cleaned, sentiment-annotated data and provides insights into overall mood and polarity trends across multiple artists.