Song Data Cleaning Process
Created on: 11 January 2025
#NLTK library for natural language processing
import nltk
nltk.download('vader_lexicon')
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
#Dataset management/data handling
import pandas as pd
import numpy as np #for calculation
import string
import re
#Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sn = SenticNet()
vader = SentimentIntensityAnalyzer()
import os #for user convenience
######### Change based on your path here ##########
# Folder containing one "<ArtistStem>.csv" file per artist.
base_dir = r'Song_csv'

# File stems of the per-artist CSV files (no spaces, no extension).
artists = [
    'ArianaGrande',
    'Beyonce',
    'BillieEilish',
    'CardiB',
    'CharliePuth',
    'ColdPlay',
    'Drake',
    'DuaLipa',
    'EdSheeran',
    'Eminem',
    'JustinBieber',
    'KatyPerry',
    'Khalid',
    'LadyGaga',
    'Maroon5',
    'NickiMinaj',
    'PostMalone',
    'Rihanna',
    'SelenaGomez',
    'TaylorSwift',
]

# Full path of every artist file, in the same order as `artists`.
file_paths = [os.path.join(base_dir, stem + ".csv") for stem in artists]

# Echo the resolved paths so a wrong base_dir is easy to spot.
for file in file_paths:
    print(f"File path: {file}")

# One DataFrame per artist, in the same order as `file_paths`.
df_all = list(map(pd.read_csv, file_paths))
####### Data preparation ###########
def cleaning(df):
    """Return a cleaned copy of one artist's song table.

    The following rows are removed, in this order:

    1. Songs whose ``Title`` contains one of the alternate-recording markers
       (remix, live, version, demo, reprise, copy, mix, edit). Matching is a
       case-sensitive substring test, so e.g. ``'edit'`` also matches inside
       a longer word.
    2. Rows whose ``Title`` or ``Lyric`` is missing.
    3. Every row whose ``Title`` occurs more than once, then every row whose
       ``Lyric`` occurs more than once (``keep=False``: no copy survives).

    Args:
        df: DataFrame with at least the columns ``Title`` and ``Lyric``.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels. The input frame is never modified.
    """
    # Same marker set the original checked with one str.find() call each.
    variant_markers = (
        'remix', 'Remix', 'live', 'Live', 'Version', 'version',
        'Demo', 'Reprise', 'COPY', 'Mix', 'edit', 'Edit',
    )

    def is_variant(title):
        # Non-string titles (NaN) are not variants; dropna() removes them below.
        return isinstance(title, str) and any(
            marker in title for marker in variant_markers
        )

    # A boolean mask works for any index and for repeated titles, unlike
    # dropping rows by a hand-maintained positional counter.
    variant_mask = df['Title'].map(is_variant).astype(bool)
    cleaned = df.loc[~variant_mask].copy()

    cleaned = cleaned.dropna(subset=["Title"])
    cleaned = cleaned.dropna(subset=["Lyric"])
    cleaned = cleaned.drop_duplicates(subset="Title", keep=False)
    cleaned = cleaned.drop_duplicates(subset="Lyric", keep=False)
    return cleaned
def lyrics_to_words(document):
    """Normalize one lyric into a space-separated string of lemmatized words.

    Steps, in order: lowercase and split on whitespace, drop NLTK English
    stopwords, delete every ``string.punctuation`` character, then lemmatize
    each remaining word with ``WordNetLemmatizer``.

    Args:
        document: The raw lyric text.

    Returns:
        The normalized text, or ``""`` when ``document`` is not a string.
    """
    if not isinstance(document, str):
        return ""  # Handle non-string input

    stop_words = set(stopwords.words('english'))
    lemma = WordNetLemmatizer()
    # One translate() pass deletes all punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Stopwords are removed before punctuation so tokens are compared against
    # the stopword list with their apostrophes still in place.
    stopwordremoval = " ".join(
        token for token in document.lower().split() if token not in stop_words
    )
    punctuationremoval = stopwordremoval.translate(punctuation_table)
    normalized = " ".join(
        lemma.lemmatize(word) for word in punctuationremoval.split()
    )
    return normalized
def uniqueWords(df):
    """Add a ``words`` column holding each song's distinct normalized words.

    Every value in ``df['Lyric']`` is passed through ``lyrics_to_words`` and
    split on whitespace; repeated words are then removed.

    Args:
        df: DataFrame with a ``Lyric`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``words`` column (one
        list of unique words per row).
    """
    def unique(list1):
        # dict.fromkeys() removes repeats but keeps first-seen order, so the
        # result is the same on every run (set() order varies between runs).
        return list(dict.fromkeys(list1))

    df['words'] = [
        unique(lyrics_to_words(lyric).split()) for lyric in df['Lyric']
    ]
    return df
# Clean every artist frame and attach its per-song unique-word list.
df_allclean = []
for df in df_all:
    df_cleaned = cleaning(df)
    df_word_unique = uniqueWords(df_cleaned)
    df_allclean.append(df_word_unique)

# Stack all artists into one table and drop songs repeated across files.
frames = df_allclean
df_main = pd.concat(frames, ignore_index=True)
df_main = df_main.drop_duplicates(subset=['Title', 'Lyric'])

# Coerce Year to int: unparseable values become NaN, then 0, and 0 is
# removed by the plausibility filter on the next statement.
year_numeric = pd.to_numeric(df_main['Year'], errors='coerce')
df_main['Year'] = year_numeric.fillna(0).astype(int)
df_main = df_main[df_main['Year'].between(1900, 2024)]

# Remove leftover index columns if the CSV files carried them.
df_main = df_main.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
df_main = df_main.reset_index(drop=True)
# Maps each known contraction (and the artist name "cardi b") to a single
# apostrophe-free / space-free token. Keys are lowercase only.
_CONTRACTIONS = {
    "i'm": "im", "i've": "ive", "i'll": "ill", "i'd": "id", "you're": "youre", "you've": "youve", "you'll": "youll",
    "he's": "hes", "she's": "shes", "it's": "its", "they're": "theyre", "they've": "theyve", "we're": "were",
    "we've": "weve", "we'll": "well", "they'll": "theyll", "isn't": "isnt", "aren't": "arent", "wasn't": "wasnt",
    "weren't": "werent", "hasn't": "hasnt", "haven't": "havent", "hadn't": "hadnt", "won't": "wont", "wouldn't": "wouldnt",
    "shouldn't": "shouldnt", "can't": "cant", "couldn't": "couldnt", "don't": "dont", "doesn't": "doesnt", "didn't": "didnt",
    "cardi b": "cardib"
}

# Compiled once at import time instead of on every call. Keys are escaped so
# they are always matched literally, and longer keys are tried first.
_CONTRACTION_PATTERN = re.compile(
    r'\b('
    + '|'.join(
        re.escape(key) for key in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r')\b'
)


def expand_contractions(text):
    """Collapse known contractions into single tokens and strip apostrophes.

    Despite the name, contractions are not spelled out: each key of the
    contraction table is replaced by its apostrophe-free form (``"don't"``
    becomes ``"dont"``) and ``"cardi b"`` becomes ``"cardib"``. Every
    remaining apostrophe is then removed. Matching is case-sensitive against
    lowercase keys, so callers should lowercase ``text`` first.

    Args:
        text: The string to process.

    Returns:
        The processed string.
    """
    expanded_text = _CONTRACTION_PATTERN.sub(
        lambda match: _CONTRACTIONS[match.group()], text
    )
    # Remove apostrophes from all other instances
    return expanded_text.replace("'", '')
# Collect every token of every lyric in a frame, for word-level analysis.
def countword(df):
    """Return all tokens from ``df['Lyric']`` as one flat list.

    Each lyric is lowercased, run through ``expand_contractions`` and then
    split with ``wordpunct_tokenize``. Tokens are kept in row order and
    repeats are not removed.
    """
    return [
        token
        for lyrics in df['Lyric']
        for token in wordpunct_tokenize(expand_contractions(lyrics.lower()))
    ]
def analyze_sentiment(word_list):
    """Look up polarity and mood tags in SenticNet for each word.

    Args:
        word_list: Iterable of words to look up.

    Returns:
        A ``(sentiment_data, count_of_word_not_in_senticNet)`` tuple.
        ``sentiment_data`` maps each word that was looked up successfully to
        a dict with the keys ``'polarity'``, ``'primary_mood'`` and
        ``'secondary_mood'``. The count is the number of words whose lookup
        raised an exception.
    """
    # Initialize SenticNet
    sn = SenticNet()

    # Dictionary to store polarity and both mood tags for each word
    sentiment_data = {}
    # Number of words whose lookup failed
    count_of_word_not_in_senticNet = 0

    for word in word_list:
        try:
            sn_data = sn.concept(word)
            mood_tags = sn.moodtags(word)
            first_mood_tag = mood_tags[0]
            second_mood_tag = mood_tags[1]
        except Exception:
            # Deliberate best-effort: any lookup failure is treated as "word
            # not in SenticNet". NOTE(review): presumably KeyError for unknown
            # words and IndexError for fewer than two mood tags -- confirm
            # against the senticnet package before narrowing this clause.
            count_of_word_not_in_senticNet += 1
        else:
            if sn_data:
                # 'N/A' is stored when the entry has no 'polarity_value' key.
                polarity = sn_data.get('polarity_value', 'N/A')
                sentiment_data[word] = {
                    'polarity': polarity,
                    'primary_mood': first_mood_tag,
                    'secondary_mood': second_mood_tag
                }

    return sentiment_data, count_of_word_not_in_senticNet
# Zero-initialized tally template: one counter per mood tag. Callers copy it
# before counting so this template itself stays at zero.
# "#pleasantness" was previously misspelled "#pleasantess", so that mood could
# never be matched by a lookup and its counter always stayed at 0.
fixed_moods = {
    "#eagerness": 0,
    "#calmness": 0,
    "#joy": 0,
    "#pleasantness": 0,
    "#disgust": 0,
    "#sadness": 0,
    "#anger": 0,
    "#fear": 0
}
def songSentiment_analyzer_by_word_primary_emotion(artist):
    """Tally the dominant primary mood of each song by one artist.

    For every row of ``df_main`` whose ``Artist`` equals ``artist``, the
    song's ``words`` list is passed to ``analyze_sentiment``. The primary
    mood tags of the returned words are counted, and the most frequent one
    (ties go to the tag listed first in ``fixed_moods``) is taken as the
    song's mood.

    Args:
        artist: Value compared against the ``Artist`` column of ``df_main``.

    Returns:
        A tuple ``(total_songs, song_mood_category,
        number_of_word_not_in_artist_lyric)``: the number of songs by the
        artist, a dict mapping each mood in ``fixed_moods`` to the number of
        songs dominated by it, and the total count of failed word lookups
        across all of the artist's songs.
    """
    # filter the song by Artist
    Artist_df = df_main[df_main.Artist == artist]

    # Get the total number of songs
    total_songs = Artist_df.shape[0]

    song_mood_category = fixed_moods.copy()
    number_of_word_not_in_artist_lyric = 0

    for words in Artist_df['words']:
        sentiment_analysis, count = analyze_sentiment(words)
        number_of_word_not_in_artist_lyric += count

        # Count how often each known mood is the primary mood of a word.
        mood_count = fixed_moods.copy()
        for data in sentiment_analysis.values():
            primary_emotion = data['primary_mood']
            if primary_emotion in mood_count:
                mood_count[primary_emotion] += 1

        # find the most frequent mood that all words in a song represent
        highest_mood = max(mood_count, key=mood_count.get)
        # A song with no recognized mood at all has no dominant mood; without
        # this guard max() would silently credit it to the first key.
        if mood_count[highest_mood] > 0:
            song_mood_category[highest_mood] += 1

    return total_songs, song_mood_category, number_of_word_not_in_artist_lyric
# Kept as a module-level name for backward compatibility; the function below
# now builds its own fresh lists instead of copying this empty one.
polarity_list = []


def songSentiment_analyzer_by_polarity_averaging(artist):
    """Return the mean of the per-song average polarity for one artist.

    For every row of ``df_main`` whose ``Artist`` equals ``artist``, the
    song's ``words`` list is passed to ``analyze_sentiment`` and the polarity
    values of the returned words are averaged. The result is the mean of
    those per-song averages.

    Args:
        artist: Value compared against the ``Artist`` column of ``df_main``.

    Returns:
        The artist's average polarity as a float, or NaN when no song of the
        artist has any usable polarity value.
    """
    # filter the song by Artist
    Artist_df = df_main[df_main.Artist == artist]

    avg_polarity_list_for_artist = []
    for words in Artist_df['words']:
        sentiment_analysis, _ = analyze_sentiment(words)

        # Start a fresh list for each song. The original reused one list for
        # all songs, so every "song average" was really a running mean over
        # all songs seen so far.
        polarity_list_for_avgSONG = []
        for data in sentiment_analysis.values():
            try:
                # senticnet polarity value is in string
                polarity_list_for_avgSONG.append(float(data['polarity']))
            except (TypeError, ValueError):
                # Skip placeholders such as 'N/A' that are not numeric.
                continue

        # Songs without any polarity value would contribute NaN; leave them out.
        if polarity_list_for_avgSONG:
            avg_polarity_list_for_artist.append(
                np.mean(polarity_list_for_avgSONG)
            )

    if not avg_polarity_list_for_artist:
        return float('nan')
    return np.mean(avg_polarity_list_for_artist)
# Rebinds `artists` to the names that the analyzer functions compare against
# the Artist column of df_main (the earlier list held CSV file stems).
artists = ['Ariana Grande', 'Beyoncé', 'Billie Eilish', 'Cardi B', 'Charlie Puth', 'Coldplay', 'Drake', 'Dua Lipa',
           'Ed Sheeran', 'Eminem', 'Justin Bieber', 'Katy Perry', 'Khalid', 'Lady Gaga', 'Maroon 5', 'Nicki Minaj',
           'Post Malone', 'Rihanna', 'Selena Gomez', 'Taylor Swift']

# One dict per artist: song count, failed-lookup count, and one column per mood.
df_secticnet = []
for artist in artists:
    # Perform sentiment analysis for the current artist
    total_songs, song_mood_category, number_of_word_not_in_artist_lyric = (
        songSentiment_analyzer_by_word_primary_emotion(artist)
    )

    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Total Songs": total_songs,
        "Words Not Found": number_of_word_not_in_artist_lyric
    }
    # Add mood categories to the dictionary
    artist_data.update(song_mood_category)

    # Append the artist's data to the list
    df_secticnet.append(artist_data)

# Mood table: one row per artist.
df_senticnet = pd.DataFrame(df_secticnet)
print(df_senticnet)
# Starts as a list of per-artist dicts and is rebound to a DataFrame below.
df_secticnet_polarity = []
for artist in artists:
    # Perform sentiment analysis for the current artist
    avg_polarity_for_artist = songSentiment_analyzer_by_polarity_averaging(artist)

    # Create a dictionary with the artist's data
    artist_data = {
        "Artist": artist,
        "Average Polarity Score": avg_polarity_for_artist
    }

    # Append the artist's data to the list
    df_secticnet_polarity.append(artist_data)

# Polarity table: one row per artist.
df_secticnet_polarity = pd.DataFrame(df_secticnet_polarity)
print(df_secticnet_polarity)
# Bar chart: one bar per artist, bar height = average polarity score.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    df_secticnet_polarity["Artist"],
    df_secticnet_polarity["Average Polarity Score"],
    color='skyblue',
)
# Rotate artist names for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_xlabel("Artists")
ax.set_ylabel("Average Polarity Score")
ax.set_title("Average Polarity Score by Artist")
fig.tight_layout()
plt.show()
##############################################################################
##############################################################################
# Headline numbers for the combined dataset, emitted as one report block.
print("\n".join([
    "========================================",
    " BASIC DATA STATISTICS ",
    "========================================",
    f"Total number of songs in df_main: {len(df_main)}",
    f"Number of unique artists: {df_main['Artist'].nunique()}",
    f"Year range: {df_main['Year'].min()} to {df_main['Year'].max()}",
    "========================================\n",
]))
# Word count per song; non-string lyrics count as zero words.
df_main['lyric_length'] = df_main['Lyric'].map(
    lambda lyric: 0 if not isinstance(lyric, str) else len(lyric.split())
)

# Summary statistics of the word count over all songs.
lyric_lengths = df_main['lyric_length']
avg_lyric_length_all = lyric_lengths.mean()
median_lyric_length_all = lyric_lengths.median()
std_lyric_length_all = lyric_lengths.std()

print("\n".join([
    "========================================",
    " LYRIC LENGTH STATISTICS (ALL SONGS) ",
    "========================================",
    f"Average lyric length (words): {avg_lyric_length_all:.2f}",
    f"Median lyric length (words): {median_lyric_length_all:.2f}",
    f"Standard Deviation: {std_lyric_length_all:.2f}",
    "========================================\n",
]))
# Per-artist mean / min / max of Year and lyric_length, with flat column
# names assigned directly through named aggregation.
artist_stats = df_main.groupby('Artist').agg(
    Avg_Year=('Year', 'mean'),
    Min_Year=('Year', 'min'),
    Max_Year=('Year', 'max'),
    Avg_LyricLength=('lyric_length', 'mean'),
    Min_LyricLength=('lyric_length', 'min'),
    Max_LyricLength=('lyric_length', 'max'),
)

print("========================================")
print(" STATISTICS GROUPED BY ARTIST ")
print("========================================")
print(artist_stats)
print("========================================\n")
# Correlation between release year and lyric length, read from the 2x2
# correlation matrix by label.
year_length_corr_matrix = df_main[['Year', 'lyric_length']].corr()
corr_year_lyriclength = year_length_corr_matrix.loc['Year', 'lyric_length']

print("\n".join([
    "========================================",
    " CORRELATION ANALYSIS ",
    "========================================",
    f"Correlation between Year and Lyric Length: {corr_year_lyriclength:.4f}",
    "========================================\n",
]))
# Mean lyric length per release year, with Year kept as a regular column.
year_grouped = df_main.groupby('Year', as_index=False)['lyric_length'].mean()

# Line chart of the yearly averages.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    year_grouped['Year'],
    year_grouped['lyric_length'],
    marker='o',
    linestyle='-',
    color='purple',
)
ax.set_title("Average Lyric Length Over the Years")
ax.set_xlabel("Year")
ax.set_ylabel("Average Lyric Length (words)")
ax.grid(True)
plt.show()
Can you tell me the flow of this program (data collection, cleaning, analysis, etc.)?
Below is a high-level breakdown of the flow in this program. Each step explains what is happening and how the data is processed, from collection to analysis and visualization.
Directory Setup
A base directory (base_dir = r'Song_csv') is defined, which points to where all the CSV files with song data are stored.
List of Artists
A list of artist names is specified (e.g., ArianaGrande, Beyonce, …, TaylorSwift). The program then builds a file path for each artist by combining the base directory, the artist name, and the .csv extension.
Reading CSV Files
Each CSV file (ArtistName.csv) is read into a pandas DataFrame. This results in a list of DataFrames, one for each artist.
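Cleaning Function (cleaning(df))
Removes songs whose Title marks an alternate recording (remix, live, version, demo, reprise, copy, mix, edit), drops rows where Title or Lyric is NaN, and also removes duplicated Titles and Lyrics (every copy of a duplicate is dropped, not just the extras).
Lemmatization & Stopword Removal (lyrics_to_words(document))
Lowercases each lyric, removes English stopwords, strips punctuation using the string module, and lemmatizes the remaining words.
Extract Unique Words (uniqueWords(df))
After cleaning, each song’s lyrics are converted into a list of unique words (stored under df['words']).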
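Concatenate Cleaned Data
After cleaning each artist’s DataFrame, all the cleaned DataFrames are concatenated into one main DataFrame (df_main).
Songs with the same Title and Lyric are de-duplicated, the Year column is converted to integers and restricted to 1900–2024, and leftover index columns (e.g., Unnamed: 0) are dropped.
Handle Contractions (expand_contractions(text))
Certain contractions are replaced with simpler forms (e.g., "i'm" -> "im", "don't" -> "dont") to reduce the total number of variant words; all remaining apostrophes are then removed.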
Count Words for Possible Analysis (countword(df))
A helper function that tokenizes each lyric and collects all tokens into one list. In this flow it is not used for a final output, but it could be leveraged for further analysis such as word counts.
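analyze_sentiment(word_list)
Looks up each word in SenticNet and records its polarity value, its primary mood tag (#joy, #anger, …) and its secondary mood tag (#sadness, …). Words whose lookup fails are counted instead.
songSentiment_analyzer_by_word_primary_emotion(artist)
For each song by the artist, this function runs analyze_sentiment(...) on the lyric’s word list. It counts the primary moods (#joy, #fear, …) of all the words in that one song, then finds the “highest” mood (the one that occurs most frequently). Finally, it tallies how many of the artist’s songs fall under each mood (#joy, #sadness, etc.).
songSentiment_analyzer_by_polarity_averaging(artist)
Averages the SenticNet polarity values of the words in the artist’s songs and returns a single average polarity score for that artist.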
Moods & Polarity DataFrames
The program loops over all artists and calls songSentiment_analyzer_by_word_primary_emotion(artist) and songSentiment_analyzer_by_polarity_averaging(artist) for each one. The results are collected into two DataFrames:
df_senticnet (mood analysis, one row per artist)
df_secticnet_polarity (polarity analysis, one row per artist)
Visualization (Matplotlib)
A bar chart is plotted showing the average polarity score for each artist.
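After the main sentiment analysis, the code prints out several data statistics:
Basic Data Statistics
The total number of songs, the number of unique artists, and the year range in df_main.
Lyric Length Statistics (All Songs)
The mean, median, and standard deviation of the per-song word count.
Group-by Artist Statistics
The mean, minimum, and maximum of Year and lyric_length for each artist.
Correlation Analysis
The correlation between Year and lyric_length.
Additional Visualization
A line plot of the average lyric length per year (computed from df_main).
This integrated approach provides both sentiment insights (emotions, polarity) and statistical insights (lyric length, correlation with year, etc.) on the artists’ discographies.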