Song Data Cleaning Process

Question

# NLTK library for natural language processing
import nltk

# Download every NLTK resource this script uses, not only the VADER lexicon:
# stopwords.words('english') reads the 'stopwords' corpus and
# WordNetLemmatizer reads 'wordnet'. On a fresh NLTK installation a missing
# corpus raises LookupError at first use.
# NOTE(review): 'omw-1.4' is only required by some NLTK releases when WordNet
# is loaded; downloading it is harmless elsewhere - confirm against the
# pinned NLTK version.
for _nltk_resource in ('vader_lexicon', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Sentiment Analysis

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet

#Dataset management/data handling
import pandas as pd
import numpy as np #for calculation
import string
import re

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Suppress warnings

import warnings
warnings.filterwarnings("ignore")

# Initialize SenticNet and the VADER sentiment analyzer once, at module level,
# so the rest of the script can share these two instances.
sn = SenticNet()
vader = SentimentIntensityAnalyzer()

import os #for user convenience

# Define the base directory that holds one CSV file per artist
######### Change based on your path here ##########
base_dir = r'Song_csv'

# List of artist names; each name is the stem of a CSV file inside base_dir
artists = [
    'ArianaGrande', 'Beyonce', 'BillieEilish', 'CardiB', 'CharliePuth',
    'ColdPlay', 'Drake', 'DuaLipa', 'EdSheeran', 'Eminem', 'JustinBieber',
    'KatyPerry', 'Khalid', 'LadyGaga', 'Maroon5', 'NickiMinaj',
    'PostMalone', 'Rihanna', 'SelenaGomez', 'TaylorSwift'
]

# Build the list of file paths: <base_dir>/<artist>.csv
file_paths = [os.path.join(base_dir, f"{artist}.csv") for artist in artists]

# Print file paths (optional)
for file in file_paths:
    print(f"File path: {file}")

# Read CSV files into a list of DataFrames, one per entry of file_paths
# (same order as file_paths).
df_all = [pd.read_csv(file) for file in file_paths]

####### Data preparation ###########

# Preprocessing function
# Remove any song that is not the original version of an artist's song

# Case-sensitive substrings that mark a title as a non-original version.
# Matching is plain substring containment, so e.g. 'live' also matches a
# title such as 'Alive'.
# NOTE(review): that over-matching is inherited from the original filter -
# confirm whether whole-word matching is wanted.
_NON_ORIGINAL_MARKERS = (
    'remix', 'Remix', 'live', 'Live', 'Version', 'version', 'Demo',
    'Reprise', 'COPY', 'Mix', 'edit', 'Edit',
)


def _is_non_original(title):
    """Return True when *title* is a string containing a non-original marker."""
    return isinstance(title, str) and any(
        marker in title for marker in _NON_ORIGINAL_MARKERS
    )


def cleaning(df):
    """Return a cleaned copy of *df* holding only usable, original songs.

    Steps, in order:
      1. drop rows whose 'Title' contains one of ``_NON_ORIGINAL_MARKERS``;
      2. drop rows whose 'Title' or 'Lyric' is missing;
      3. drop every row whose 'Title' occurs more than once;
      4. drop every row whose 'Lyric' occurs more than once.

    Steps 3 and 4 use ``keep=False``, so all copies of a duplicated value are
    removed, not just the extras.

    Parameters:
        df: DataFrame with at least the columns 'Title' and 'Lyric'.

    Returns:
        A new DataFrame; *df* itself is left untouched and the surviving rows
        keep their original index labels.
    """
    # Missing (NaN/None) titles are simply "not flagged" here; they are
    # removed by the dropna below instead of crashing a string method.
    flagged = df['Title'].map(_is_non_original).astype(bool)

    # Positional boolean mask: works for any index (non-default, duplicated).
    cleaned = df.loc[~flagged.to_numpy()].copy()

    cleaned = cleaned.dropna(subset=['Title', 'Lyric'])
    # The two de-duplication passes are sequential on purpose: the lyric pass
    # only sees rows that survived the title pass.
    cleaned = cleaned.drop_duplicates(subset='Title', keep=False)
    cleaned = cleaned.drop_duplicates(subset='Lyric', keep=False)
    return cleaned

# Change lyrics into a word list that is easier to process during analysis:
#   - lowercase everything so the lyrics are not case-sensitive
#   - remove stop words to de-clutter the dataframe
#   - lemmatize so that only base words are used
def lyrics_to_words(document):
    """Normalise one lyric into a space-separated string of lemmas.

    The lyric is lowercased and split on whitespace, English stop words are
    removed, every ``string.punctuation`` character is deleted, and each
    remaining word is passed through ``WordNetLemmatizer.lemmatize``.

    Parameters:
        document: the raw lyric. Anything that is not a ``str`` (e.g. NaN)
            yields an empty string.

    Returns:
        The normalised lyric as a single string (possibly empty).
    """
    if not isinstance(document, str):
        return ""  # Handle non-string input

    stop_words = set(stopwords.words('english'))
    punctuation = string.punctuation
    lemmatizer = WordNetLemmatizer()

    # A token is a stop word either as written ("don't") or once surrounding
    # punctuation is peeled off ("you," / "(the"). Checking only the raw token
    # would let every stop word that touches punctuation slip through.
    kept = [
        token
        for token in document.lower().split()
        if token not in stop_words
        and token.strip(punctuation) not in stop_words
    ]

    # Delete all punctuation characters in one pass.
    without_punctuation = " ".join(kept).translate(
        str.maketrans('', '', punctuation)
    )

    return " ".join(
        lemmatizer.lemmatize(word) for word in without_punctuation.split()
    )

# Extract unique words from the lyric of each song
def uniqueWords(df):
    """Add a 'words' column holding each song's distinct normalised words.

    Every value of ``df['Lyric']`` is normalised with ``lyrics_to_words`` and
    split on whitespace; duplicates are removed while keeping the order of
    first appearance, so the result is identical from run to run (a plain
    ``set`` would give an arbitrary order).

    Parameters:
        df: DataFrame with a 'Lyric' column. It is modified in place.

    Returns:
        The same DataFrame object, now with the 'words' column.
    """
    df['words'] = [
        list(dict.fromkeys(lyrics_to_words(lyric).split()))
        for lyric in df['Lyric']
    ]
    return df

# Main pipeline

# Clean every per-artist DataFrame and attach its unique-word lists
df_allclean = []
for df in df_all:
    df_cleaned = cleaning(df)
    df_word_unique = uniqueWords(df_cleaned)
    df_allclean.append(df_word_unique)

# df_main is the concatenation of the cleaned DataFrames (df_allclean)
frames = df_allclean
df_main = pd.concat(frames, ignore_index=True)
df_main.drop_duplicates(subset=['Title', 'Lyric'], inplace=True)

# Remove any song whose year lies outside the span 1900-2024
df_main['Year'] = pd.to_numeric(df_main['Year'], errors='coerce')  # Ensure all values are numeric
df_main['Year'] = df_main['Year'].fillna(0)  # Unparseable years become 0 and are filtered out below
df_main['Year'] = df_main['Year'].astype(int)  # Convert to integer
# .copy() so later column assignments act on an independent frame rather
# than on a slice of the unfiltered one.
df_main = df_main[(df_main['Year'] >= 1900) & (df_main['Year'] <= 2024)].copy()

# Remove the index columns (unnamed) that came from the CSV files; they are
# not needed. errors='ignore' skips whichever of them is absent.
df_main = df_main.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df_main = df_main.reset_index(drop=True)

# Define a function to handle contractions.
# A contraction such as "i'm" is collapsed to "im" instead of being expanded
# to "i am": this keeps the number of words per lyric down and does not stray
# far from the original form of the song.
_CONTRACTIONS = {
    "i'm": "im", "i've": "ive", "i'll": "ill", "i'd": "id", "you're": "youre", "you've": "youve", "you'll": "youll",
    "he's": "hes", "she's": "shes", "it's": "its", "they're": "theyre", "they've": "theyve", "we're": "were",
    "we've": "weve", "we'll": "well", "they'll": "theyll", "isn't": "isnt", "aren't": "arent", "wasn't": "wasnt",
    "weren't": "werent", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "won't": "wont", "wouldn't": "wouldnt",
    "shouldn't": "shouldnt", "can't": "cant", "couldn't": "couldnt", "don't": "dont", "doesn't": "doesnt", "didn't": "didnt",
    "cardi b": "cardib"
}

# Built once at import time instead of on every call. Keys are escaped so
# that a future entry containing a regex metacharacter is matched literally.
_CONTRACTION_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in _CONTRACTIONS) + r')\b'
)


def expand_contractions(text):
    """Collapse known contractions in *text* and strip remaining apostrophes.

    Matching is case-sensitive and bounded by word boundaries, so callers
    are expected to pass lowercased text.

    Parameters:
        text: the string to process.

    Returns:
        *text* with every key of ``_CONTRACTIONS`` replaced by its value and
        every remaining ``'`` removed.
    """
    collapsed = _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTIONS[match.group()], text
    )
    # Remove apostrophes from all other instances
    return collapsed.replace("'", '')

# Function to collect the words of every lyric in a DataFrame for analysis
def countword(df):
    """Return one flat list with the tokens of every lyric in *df*.

    Each lyric is lowercased, passed through ``expand_contractions`` and split
    with ``wordpunct_tokenize``. Values of ``df['Lyric']`` that are not
    strings (e.g. NaN) are skipped.

    Parameters:
        df: DataFrame with a 'Lyric' column.

    Returns:
        A list of tokens in song order; repeated words are kept.
    """
    all_words = []
    for lyrics in df['Lyric']:
        if not isinstance(lyrics, str):
            continue  # nothing to tokenize; .lower() would fail here
        expanded_lyrics = expand_contractions(lyrics.lower())  # Collapse contractions first
        all_words.extend(wordpunct_tokenize(expanded_lyrics))  # Tokenize
    return all_words

# Function to analyze the sentiment of a song's words using SenticNet
def analyze_sentiment(word_list):
    """Look up every word of *word_list* in SenticNet.

    Uses the module-level ``sn`` instance instead of constructing a new
    ``SenticNet()`` on every call.

    Parameters:
        word_list: iterable of words (the 'words' entry of one song).

    Returns:
        A tuple ``(sentiment_data, count_of_word_not_in_senticNet)`` where
        ``sentiment_data`` maps each word that was found to a dict with the
        keys 'polarity', 'primary_mood' and 'secondary_mood', and the count is
        the number of words whose lookup failed.
    """
    # Dictionary to store polarity and moods for each word
    sentiment_data = {}

    # Number of words for which the SenticNet lookup failed
    count_of_word_not_in_senticNet = 0

    for word in word_list:
        # Only the lookups sit inside the try so that a failure here always
        # means "no usable SenticNet entry for this word".
        # NOTE(review): the exception type SenticNet raises for an unknown
        # concept is not visible in this file, so the handler stays broad.
        try:
            sn_data = sn.concept(word)
            mood_tags = sn.moodtags(word)
            first_mood_tag = mood_tags[0]
            second_mood_tag = mood_tags[1]
        except Exception:
            count_of_word_not_in_senticNet += 1
            continue

        if sn_data:
            sentiment_data[word] = {
                'polarity': sn_data.get('polarity_value', 'N/A'),
                'primary_mood': first_mood_tag,
                'secondary_mood': second_mood_tag
            }

    return sentiment_data, count_of_word_not_in_senticNet

# fixed_moods lists every emotion (mood tag) used by SenticNet 6, each with a
# zero count; it is copied whenever a fresh tally is needed.
# The key must match the mood tag text exactly: the former spelling
# "#pleasantess" could never match "#pleasantness", so those words were
# silently left out of every tally.
fixed_moods = {
    "#eagerness": 0,
    "#calmness": 0,
    "#joy": 0,
    "#pleasantness": 0,
    "#disgust": 0,
    "#sadness": 0,
    "#anger": 0,
    "#fear": 0
}

def songSentiment_analyzer_by_word_primary_emotion(artist):
    """Classify each of an artist's songs by its most frequent primary mood.

    For every song of *artist* in ``df_main`` the primary mood of each word
    found by ``analyze_sentiment`` is tallied; the mood with the highest tally
    is taken as the song's mood.

    Parameters:
        artist: value to match against ``df_main.Artist``.

    Returns:
        A tuple ``(total_songs, song_mood_category,
        number_of_word_not_in_artist_lyric)``:
          - total_songs: number of rows for the artist;
          - song_mood_category: dict with the keys of ``fixed_moods`` mapping
            to the number of songs dominated by that mood. Songs in which no
            word carried a known mood are not counted under any mood, so the
            values may sum to less than ``total_songs``;
          - number_of_word_not_in_artist_lyric: total of the "not found"
            counts over all songs.
    """
    # Filter the songs by artist
    Artist_df = df_main[df_main.Artist == artist]

    # Get the total number of songs
    total_songs = Artist_df.shape[0]

    song_mood_category = fixed_moods.copy()
    number_of_word_not_in_artist_lyric = 0

    for words in Artist_df['words']:
        sentiment_analysis, count = analyze_sentiment(words)
        number_of_word_not_in_artist_lyric += count

        # Tally the primary mood of every recognised word of this song
        mood_count = fixed_moods.copy()
        for data in sentiment_analysis.values():
            primary_emotion = data['primary_mood']
            if primary_emotion in mood_count:
                mood_count[primary_emotion] += 1

        # Find the most frequent mood among the words of the song
        highest_mood = max(mood_count, key=mood_count.get)

        # With an all-zero tally max() just returns the first key, which
        # would label a song without any recognised word as "#eagerness".
        if mood_count[highest_mood] == 0:
            continue

        song_mood_category[highest_mood] += 1

    return total_songs, song_mood_category, number_of_word_not_in_artist_lyric

# Kept for backward compatibility; the function below no longer needs it.
polarity_list = []


def songSentiment_analyzer_by_polarity_averaging(artist):
    """Return the mean of the per-song mean polarities of an artist.

    For every song of *artist* in ``df_main`` the polarity of each word found
    by ``analyze_sentiment`` is averaged; the result is the average of those
    per-song averages.

    Parameters:
        artist: value to match against ``df_main.Artist``.

    Returns:
        The artist-level average polarity, or NaN when no song of the artist
        has any word with a usable polarity.
    """
    # Filter the songs by artist
    Artist_df = df_main[df_main.Artist == artist]

    avg_polarity_list_for_artist = []

    for words in Artist_df['words']:
        sentiment_analysis, _ = analyze_sentiment(words)

        # A fresh list per song: reusing one list across songs would make
        # every "song average" a running average over all previous songs.
        polarity_list_for_avgSONG = []
        for data in sentiment_analysis.values():
            try:
                # SenticNet polarity value is a string
                polarity_list_for_avgSONG.append(float(data['polarity']))
            except (TypeError, ValueError):
                continue  # e.g. the 'N/A' placeholder from analyze_sentiment

        # Skip songs without any usable word; np.mean([]) is NaN and would
        # turn the whole artist average into NaN.
        if polarity_list_for_avgSONG:
            avg_polarity_list_for_artist.append(np.mean(polarity_list_for_avgSONG))

    if not avg_polarity_list_for_artist:
        return float('nan')

    return np.mean(avg_polarity_list_for_artist)

# Artist names used to filter df_main by its 'Artist' column.
# This rebinds `artists`, which earlier held the CSV file stems.
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
           'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
           'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']

# Initialize a list to hold results (one dict per artist)
df_secticnet = []

for artist in artists:
    # Perform sentiment analysis for the current artist
    total_songs, song_mood_category, number_of_word_not_in_artist_lyric = songSentiment_analyzer_by_word_primary_emotion(artist)

    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Total Songs": total_songs,
        "Words Not Found": number_of_word_not_in_artist_lyric
    }

    # Add mood categories to the dictionary
    artist_data.update(song_mood_category)

    # Append the artist's data to the list
    df_secticnet.append(artist_data)

# Create a DataFrame from the list of dictionaries
df_senticnet = pd.DataFrame(df_secticnet)

# Display the DataFrame
print(df_senticnet)

# Collect one row per artist; kept under its own name so that
# df_secticnet_polarity only ever refers to the final DataFrame.
_polarity_rows = []

for artist in artists:
    # Perform sentiment analysis for the current artist
    avg_polarity_for_artist = songSentiment_analyzer_by_polarity_averaging(artist)

    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Average Polarity Score": avg_polarity_for_artist
    }

    # Append the artist's data to the list
    _polarity_rows.append(artist_data)

# Create a DataFrame from the list of dictionaries
df_secticnet_polarity = pd.DataFrame(_polarity_rows)

# Display the DataFrame
print(df_secticnet_polarity)

# Visualization using Matplotlib: one bar per artist, height = average polarity
plt.figure(figsize=(12, 6))
plt.bar(df_secticnet_polarity["Artist"], df_secticnet_polarity["Average Polarity Score"], color='skyblue')
plt.xticks(rotation=45, ha='right')  # Rotate artist names for better readability
plt.xlabel("Artists")
plt.ylabel("Average Polarity Score")
plt.title("Average Polarity Score by Artist")
plt.tight_layout()

# Show the plot
plt.show()

##############################################################################
# DATA STATISTICS
##############################################################################

# 1) BASIC DESCRIPTIVE STATISTICS FOR THE ENTIRE DATASET
print("========================================")
print(" BASIC DATA STATISTICS ")
print("========================================")
print(f"Total number of songs in df_main: {len(df_main)}")
print(f"Number of unique artists: {df_main['Artist'].nunique()}")
print(f"Year range: {df_main['Year'].min()} to {df_main['Year'].max()}")
print("========================================\n")

# 2) AVERAGE WORD COUNT (LYRIC LENGTH)
# First, compute lyric length as the number of whitespace-separated tokens in
# each song lyric; non-string lyrics count as length 0.
df_main['lyric_length'] = df_main['Lyric'].apply(
    lambda x: len(x.split()) if isinstance(x, str) else 0
)

avg_lyric_length_all = df_main['lyric_length'].mean()
median_lyric_length_all = df_main['lyric_length'].median()
std_lyric_length_all = df_main['lyric_length'].std()

print("========================================")
print(" LYRIC LENGTH STATISTICS (ALL SONGS) ")
print("========================================")
print(f"Average lyric length (words): {avg_lyric_length_all:.2f}")
print(f"Median lyric length (words): {median_lyric_length_all:.2f}")
print(f"Standard Deviation: {std_lyric_length_all:.2f}")
print("========================================\n")

# 3) GROUP-BY ARTIST STATISTICS
# Named aggregation gives each output column its final name directly, so the
# names cannot drift out of step with the aggregation order.
artist_stats = df_main.groupby('Artist').agg(
    Avg_Year=('Year', 'mean'),
    Min_Year=('Year', 'min'),
    Max_Year=('Year', 'max'),
    Avg_LyricLength=('lyric_length', 'mean'),
    Min_LyricLength=('lyric_length', 'min'),
    Max_LyricLength=('lyric_length', 'max'),
)

print("========================================")
print(" STATISTICS GROUPED BY ARTIST ")
print("========================================")
print(artist_stats)
print("========================================\n")

# 4) CORRELATION ANALYSIS (OPTIONAL)
# For example, we can look at whether newer songs tend to have more words.
# This is just one possible correlation; many more could be computed.
corr_year_lyriclength = df_main['Year'].corr(df_main['lyric_length'])

print("========================================")
print(" CORRELATION ANALYSIS ")
print("========================================")
print(f"Correlation between Year and Lyric Length: {corr_year_lyriclength:.4f}")
print("========================================\n")

# 5) ADDITIONAL VISUALIZATIONS (OPTIONAL)
# Example: plot of Year vs. average lyric length per year, to demonstrate how
# the correlation above might be visualized.
year_grouped = df_main.groupby('Year')['lyric_length'].mean().reset_index()

plt.figure(figsize=(10, 5))
plt.plot(year_grouped['Year'], year_grouped['lyric_length'], marker='o', linestyle='-', color='purple')
plt.title("Average Lyric Length Over the Years")
plt.xlabel("Year")
plt.ylabel("Average Lyric Length (words)")
plt.grid(True)
plt.show()

Can you tell me the flow of this program (data collection, cleaning, analysis, and so on)?

Answer 1

Below is a high-level breakdown of the flow in this program. Each step explains what is happening and how the data is processed, from collection to analysis and visualization.

1) Data Collection and Loading

Directory Setup
A base directory (base_dir = r'Song_csv') is defined, which points to where all the CSV files with song data are stored.
List of Artists
A list of artist names is specified (e.g., ArianaGrande, Beyonce, …, TaylorSwift). The program then builds file paths for each artist by combining the base directory and the artist name.
Reading CSV Files
Each CSV file (ArtistName.csv) is read into a pandas DataFrame. This results in a list of DataFrames, one for each artist.

2) Data Cleaning and Preprocessing

Cleaning Function (cleaning(df))
1. Filter Out Non-Original Songs
  Removes rows whose titles contain words like “remix,” “live,” “demo,” etc., so that only original versions remain.
2. Remove Nulls and Duplicates
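  Drops rows where the Title or Lyric is NaN, and then removes every song whose Title or whose Lyric occurs more than once (keep=False discards all copies of a duplicated value, not just the extras).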
Lemmatization & Stopword Removal (lyrics_to_words(document))
1. Converts lyrics to lowercase.
2. Removes stopwords (common words like “the,” “and,” “is,” etc.) using NLTK’s stopword list.
3. Removes punctuation using a set of punctuation characters from the string module.
4. Uses a lemmatizer to transform each word into its base (lemma) form.
Extract Unique Words (uniqueWords(df))
After cleaning, each song’s lyrics are converted into a list of unique words (stored under df['words']).
Concatenate Cleaned Data
After cleaning each artist’s DataFrame, all the cleaned DataFrames are concatenated into one main DataFrame (df_main).
1. Duplicate rows (by Title and Lyric) are dropped again to ensure uniqueness.
2. Only songs from valid years (1900 to 2024) are retained.
3. Irrelevant index columns (e.g., Unnamed: 0) are dropped.
Handle Contractions (expand_contractions(text))
Certain contractions are replaced with simpler forms (e.g., "i'm" -> "im", "don't" -> "dont") to reduce the total number of variant words.
Count Words for Possible Analysis (countword(df))
A helper function to tokenize each lyric and optionally count words (though in this flow, it’s not used as a final output, but could be leveraged for further analysis).

3) Sentiment Analysis

3.1) Using SenticNet

analyze_sentiment(word_list)
1. For each word in a lyric, the program attempts to retrieve the polarity value (a float in the range $[-1, 1]$) and two mood tags (primary and secondary emotions) from the SenticNet library.
2. Words that do not exist in SenticNet are counted separately.
3. Returns a dictionary keyed by word, containing:
  - polarity (e.g., 0.25, -0.5, etc.)
  - primary_mood (e.g., #joy, #anger, …)
  - secondary_mood (e.g., #sadness, …)

Analyzing by Primary Emotion

songSentiment_analyzer_by_word_primary_emotion(artist)
1. Filters the main DataFrame to get only the songs of the specified artist.
2. For each song in the artist’s DataFrame, it calls analyze_sentiment(...) on the lyric’s word list.
3. Tally the primary emotion (e.g., #joy, #fear) of all the words in that one song, then find the “highest” mood (the one that occurs most frequently).
4. Increments that mood's counter in the artist-level tally, which is created fresh on each call (so effectively we get a “song-level mood” classification).
5. Also keeps track of how many words for that artist are not found in SenticNet.
6. Returns:
  - total_songs (the number of songs for that artist)
  - song_mood_category (the final tally of each mood, e.g., how many songs are mostly #joy, #sadness, etc.)
  - number_of_word_not_in_artist_lyric (the total words not found in SenticNet across all songs)

Analyzing by Polarity (Averaging)

songSentiment_analyzer_by_polarity_averaging(artist)
1. Filters songs by the specified artist.
2. For each song, retrieves each word’s polarity from SenticNet and stores it in a list.
3. Computes an average polarity (mean) for each song.
4. Then computes the average of these per-song averages, yielding one overall polarity value for the artist.
5. Returns the average polarity for the entire artist’s discography.

Storing the Results

Moods & Polarity DataFrames
1. The loop over all artists calls:
  - songSentiment_analyzer_by_word_primary_emotion(artist)
    - Outputs a dictionary of mood counts per artist.
  - songSentiment_analyzer_by_polarity_averaging(artist)
    - Outputs the average polarity per artist.
2. Results are appended into two separate DataFrames:
  - df_senticnet (mood analysis, one row per artist)
  - df_secticnet_polarity (polarity analysis, one row per artist)
Visualization (Matplotlib)
A bar chart is plotted showing the average polarity score for each artist.

4) Descriptive Statistics

After the main sentiment analysis, the code prints out several data statistics:

Basic Data Statistics
- Total number of songs in the consolidated df_main.
- Number of unique artists.
- The overall range of years.
Lyric Length Statistics (All Songs)
- Each song’s lyric length is calculated by splitting on whitespace.
- Prints the mean, median, and standard deviation of lyric length across the entire dataset.
Group-by Artist Statistics
- For each artist, shows:
  - Average, min, and max Year
  - Average, min, and max lyric_length
Correlation Analysis
- Looks at the correlation between the song’s Year and lyric_length.
- Prints out the correlation coefficient.
Additional Visualization
- A line plot of average lyric length by year, showing how it changes over time.

Summary of the Entire Flow

Collect/Read Data from CSV files for each artist.
Clean & Preprocess each DataFrame (drop non-original versions, remove nulls, duplicates, normalize text).
Combine all cleaned DataFrames into a single unified DataFrame (df_main).
Analyze Sentiment via SenticNet:
- Moods/emotions per artist (by “song-level dominant emotion”).
- Polarity (overall averaged polarity values per artist).
Visualize Results for polarity scores in a bar chart.
Compute & Print Descriptive Statistics (count of songs, lyric length stats, correlation, etc.).
Optional Visualization of how lyric length changes by year.

This integrated approach provides both sentiment insights (emotions, polarity) and statistical insights (lyric length, correlation with year, etc.) on the artists’ discographies.

Question

Sentiment Analysis

Suppress warnings

Initialize SenticNet and VADER Sentiment Analyzer

Define the base directory

List of artist names

Build the list of file paths

Print file paths (optional)

Read CSV files into DataFrame list

Preprocessing function

Remove any song that are not the original version of an artist's songs

Change lyrics into word_list that is easier to be process during analysis

such as, making all lyrics in lowercase to make lyrics not case-sensitive

remove stop word to de-clutter the dataframe

using lemmatazation to only use base word

Extract unique words from lyric in a song

Main pipeline

dataframe that is clean

df_main is the dataframe that is cleaned (df_allclean)

Remove any song that exist outside the span of 1900 and 2024

remove the index column (unnamed) from csv as it is not needed

Define a function to handle contractions

Here, word like i'm is not considered as I am but iam, this is to reduce the amout of word for each lyrics

and not to stray far from the original form of the song

function to analyze song lyrics sentiment value using SenticNet

fixed_moods based on all emotion in sectinet6

Initialize a list to hold results

Create a DataFrame from the list of dictionaries

Display the DataFrame

Initialize a list to hold results

Create a DataFrame from the list of dictionaries

Display the DataFrame

Visualization using Matplotlib

Show the plot

DATA STATISTICS

1) BASIC DESCRIPTIVE STATISTICS FOR THE ENTIRE DATASET

2) AVERAGE WORD COUNT (LYRIC LENGTH)

First, compute lyric length as number of tokens in each song lyric

3) GROUP-BY ARTIST STATISTICS

Rename columns for clarity

4) CORRELATION ANALYSIS (OPTIONAL)

For example, we can look at whether newer songs tend to have more words.

This is just one possible correlation. You can do many more!

5) ADDITIONAL VISUALIZATIONS (OPTIONAL)

Example: Plot of Year vs. Average lyric length per year

Just to demonstrate how you might visualize the correlation.

Thought

Answer

1) Data Collection and Loading

2) Data Cleaning and Preprocessing

3) Sentiment Analysis

3.1) Using SenticNet

Analyzing by Primary Emotion

Analyzing by Polarity (Averaging)

Storing the Results

4) Descriptive Statistics

Summary of the Entire Flow

Share this Q&A