Skip to content

Data processing

Data Processing

This file forms the basis of Spotify data processing.

Data Processing Documentation

add_playlist_tracking(name, store)

Method assigns the playlist name to each of the tracks to allow for traceback

Parameters:

Name Type Description Default
name str

The name of the playlist

required
store dict

The object in which to store extracted information

required
Source code in src/data_processing.py
293
294
295
296
297
298
299
300
def add_playlist_tracking(name, store):
    """Tag every collected track with the playlist it came from.

    The ``playlist_name`` column is replaced by a list that repeats ``name``
    once per entry in ``store['uris']``, so each track can be traced back.

    Args:
        name (str): The name of the playlist
        store (dict): The object in which to store extracted information
    """
    track_count = len(store['uris'])
    store['playlist_name'] = [name for _ in range(track_count)]

construct_storage()

Method constructs the storage dictionary in which collected track information is stored during collection, and enables saving as a csv file.

Returns:

Type Description
dict

A dictionary with each of the features initialized as keys, with associated empty list values.

Source code in src/data_processing.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def construct_storage():
    """Build the empty storage dictionary used while collecting track information.

    Each feature name becomes a key mapped to its own fresh empty list, which
    lets the filled dictionary be turned into a table and saved as a csv file.

    Returns:
        (dict): A dictionary with each of the features initialized as keys, with associated empty list values.
    """
    # Column order is kept exactly as listed here.
    columns = (
        'uris', 'names',
        'artist_names', 'artist_uris', 'artist_pop', 'artist_genres',
        'albums', 'track_pop',
        'danceability', 'energy', 'keys', 'loudness', 'modes',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valences', 'tempos',
        'types', 'ids', 'track_hrefs', 'analysis_urls',
        'durations_ms', 'time_signatures',
        'playlist_name',
    )
    return {column: [] for column in columns}

extract_artist_info(store, sp)

Method deals with extracting artist information from the artists() API call through Spotipy

Parameters:

Name Type Description Default
store dict

The object in which to store extracted information

required
sp Spotipy Authorization

The authorized spotipy credentials object

required
Source code in src/data_processing.py
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def extract_artist_info(store, sp):
    """Fetch artist popularity and genres through the Spotipy `artists()` call.

    The artist uris already held in ``store['artist_uris']`` are requested in
    batches of 50, and the results are appended to ``store['artist_pop']`` and
    ``store['artist_genres']`` in the order the API returns them.

    Args:
        store (dict): The object in which to store extracted information
        sp (Spotipy Authorization): The authorized spotipy credentials object
    """
    batch_size = 50
    artist_uris = store['artist_uris']

    # Slicing clamps at the end of the list, so the final short batch needs no special case.
    for start in range(0, len(artist_uris), batch_size):
        response = sp.artists(artist_uris[start:start + batch_size])

        for artist in response['artists']:
            store['artist_pop'].append(artist['popularity'])
            store['artist_genres'].append(artist['genres'])

extract_audio_features(store, sp)

Method deals with extracting audio analysis features for a given batch of tracks

Parameters:

Name Type Description Default
store dict

The object in which to store extracted information

required
sp Spotipy Authorization

The authorized spotipy credentials object

required
Source code in src/data_processing.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def extract_audio_features(store, sp):
    """Method deals with extracting audio analysis features for the collected tracks

    The track uris in ``store['uris']`` are sent to `audio_features()` in
    batches of 100 and each returned feature is appended to its column in
    ``store``. If the API returns ``None`` in place of a track's features,
    ``None`` is appended to every feature column for that track, so all columns
    stay the same length as ``store['uris']`` instead of the call failing.

    Args:
        store (dict): The object in which to store extracted information
        sp (Spotipy Authorization): The authorized spotipy credentials object
    """
    # (store column, key in the API response) pairs; the names differ for some features.
    column_to_feature = (
        ('danceability', 'danceability'),
        ('energy', 'energy'),
        ('keys', 'key'),
        ('loudness', 'loudness'),
        ('modes', 'mode'),
        ('speechiness', 'speechiness'),
        ('acousticness', 'acousticness'),
        ('instrumentalness', 'instrumentalness'),
        ('liveness', 'liveness'),
        ('valences', 'valence'),
        ('tempos', 'tempo'),
        ('types', 'type'),
        ('ids', 'id'),
        ('track_hrefs', 'track_href'),
        ('analysis_urls', 'analysis_url'),
        ('durations_ms', 'duration_ms'),
        ('time_signatures', 'time_signature'),
    )
    limit = 100
    track_uris = store['uris']

    # Slicing clamps at the end of the list, so the last (short) batch needs no special case.
    for offset in range(0, len(track_uris), limit):
        track_info = sp.audio_features(track_uris[offset: offset + limit])

        for track in track_info:
            for column, feature in column_to_feature:
                # A missing feature set becomes a row of None values rather than a crash.
                store[column].append(None if track is None else track[feature])

extract_tracks(sp, playlist_uri, store)

Method deals with extracting tracks from a given playlist. Note: this method forms the cornerstone of extraction, providing track access from a playlist.

Parameters:

Name Type Description Default
sp Spotipy Authorization

The authorized spotipy credentials object

required
playlist_uri str

The URI of the Spotify playlist

required
store dict

The object in which to store extracted information

required
Source code in src/data_processing.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def extract_tracks(sp, playlist_uri, store):
    """Collect every track of a playlist, then enrich the store with artist and audio data.

    This is the cornerstone of extraction: it pages through the playlist,
    stores the basic track information of each page, and finally triggers the
    artist and audio-feature lookups for everything that was collected.

    Args:
        sp (Spotipy Authorization): The authorized spotipy credentials object
        playlist_uri (str): The URI of the Spotify playlist
        store (dict): The object in which to store extracted information
    """
    page_size = 100

    # A small probe request is only used to read the playlist's total track count.
    probe = sp.playlist_tracks(playlist_uri, limit=2, offset=0)
    track_total = probe['total']

    for page_start in range(0, track_total, page_size):
        time.sleep(2)  # Pause before every page request (presumably to stay within API rate limits)
        page = sp.playlist_tracks(playlist_uri, limit=page_size, offset=page_start)
        store = retrieve_batch_info(page, store)  # Basic track info for this page

    extract_artist_info(store, sp)  # Artist popularity and genres for each track
    extract_audio_features(store, sp)  # Audio features for each track

find_top_playlists(sp, country)

Method finds the top-20 playlists in a given country

Parameters:

Name Type Description Default
sp Spotipy Authorization

The authorized spotipy credentials object

required
country str

The ISO 3166-1 alpha-2 country code of where the playlist should be extracted from.

required

Returns:

Name Type Description
uris list

A list of uris linking to each of the found playlists

names list

A related list of playlist names corresponding to the uris

Source code in src/data_processing.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
def find_top_playlists(sp, country):
    """Look up the featured playlists of a country (at most 20 are requested).

    Args:
        sp (Spotipy Authorization): The authorized spotipy credentials object
        country (str): The ISO 3166-1 alpha-2 country code of where the playlist should be extracted from.

    Returns:
        uris (list): The last ':'-separated segment of each found playlist's uri
        names (list): A related list of playlist names corresponding to the uris
    """
    response = sp.featured_playlists(country=country, limit=20)
    found = response['playlists']['items']

    # Only the part after the final ':' of each uri is kept.
    uris = [entry['uri'].rsplit(':', 1)[-1] for entry in found]
    names = [entry['name'] for entry in found]
    return uris, names

merge_stores(tracks_store, store)

Method deals with merging one store into another. Args: store (dict): the track storage object to be merged; tracks_store (dict): the larger object into which store is merged.

Source code in src/data_processing.py
237
238
239
240
241
242
243
244
def merge_stores(tracks_store, store):
    """Append the contents of one store onto another, column by column.

    ``tracks_store`` is modified in place; every key of ``store`` must already
    exist in it.

    Args:
        tracks_store (dict): The larger object in which to merge store into
        store (dict): The track storage object to be merged.
    """
    for column in store:
        tracks_store[column].extend(store[column])

process_items(store, items)

This method extracts the information from the Spotipy tracks() API call. Information includes: track uri, track name, track album, and track popularity.

Parameters:

Name Type Description Default
store dict

The object in which to store extracted information

required
items list

The list of track information generated from the Spotipy tracks API call

required

Returns:

Type Description
dict

An updated store of information

Source code in src/data_processing.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def process_items(store, items):
    """This method extracts the information from the Spotipy `tracks()` API call.
    Information includes:
    - Track uri
    - Track name
    - Uri and name of the first listed artist
    - Track album
    - Track popularity

    Items without track data (the ``track`` entry is missing or ``None``) are
    skipped. All values of a track are read before any of them is appended, so
    a malformed item cannot leave the columns of ``store`` with unequal lengths.

    Args:
        store (dict): The object in which to store extracted information
        items (list): The list of track information generated from the Spotipy tracks API call

    Returns:
        (dict): An updated store of information (the same object that was passed in)

    """
    for item in items:
        track = item.get('track')
        if track is None:  # No track data for this playlist entry; nothing to record
            continue

        lead_artist = track['artists'][0]  # Only the first listed artist is recorded
        row = {
            'uris': track['uri'].split(':')[-1],  # Keep only the part after the last ':'
            'names': track['name'],
            'artist_uris': lead_artist['uri'].split(':')[-1],
            'artist_names': lead_artist['name'],
            'albums': track['album']['name'],
            'track_pop': track['popularity'],
        }

        # Append only once the whole row was read successfully.
        for column, value in row.items():
            store[column].append(value)
    return store

retrieve_batch_info(playlist, store)

Method retrieves the essential information from the batch API call to enable simplified extraction

Returns:

Type Description
dict

A partially updated store of track information. This requires further data extraction.

Source code in src/data_processing.py
137
138
139
140
141
142
143
144
145
def retrieve_batch_info(playlist, store):
    """Pull the track entries out of one batch API response and record them in the store.

    Args:
        playlist (dict): One batch response; its ``'items'`` entry holds the track information
        store (dict): The object in which to store extracted information

    Returns:
        (dict): A partially updated store of track information. This requires further data extraction.
    """
    return process_items(store, playlist['items'])

save_data(tracks_store, name='tracks.csv')

Method deals with saving collected track data

Note: this method removes all duplicate tracks, such that all tracks within the dataset are unique, always keeping the most up-to-date representation of each track.

Parameters:

Name Type Description Default
tracks_store dict

The dictionary containing all information extracted about the tracks

required
name str

The name of the file to save the information to. Default is the tracks.csv dataset file.

'tracks.csv'
Source code in src/data_processing.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def save_data(tracks_store, name='tracks.csv'):
    """Method deals with saving collected track data

    The data is written to ``<project root>/data/<name>``, where the project
    root is the parent of the directory containing this file.

    For the ``tracks.csv`` dataset the new rows are merged with the rows already
    on disk and duplicates (same ``uris`` value) are removed, always keeping the
    newly collected, most up-to-date representation of each track. Any other
    file name is simply overwritten with the new data.

    A missing or completely empty ``tracks.csv`` is treated as "no previous
    data", so the first run creates the file instead of failing.

    Args:
        tracks_store (dict): The dictionary containing all information extracted about the tracks
        name (str): The name of the file to save the information to. Default is the tracks.csv dataset file.

    """
    root_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    file_path = os.path.join(root_path, 'data', name)
    df_new = pd.DataFrame.from_dict(tracks_store)  # Create a dataframe from the collected data

    # Only the tracks.csv dataset is merged with earlier data, so only then is the old file read.
    df_old = None
    if name == "tracks.csv" and os.path.exists(file_path):
        try:
            df_old = pd.read_csv(file_path, index_col=0)
        except pd.errors.EmptyDataError:  # File exists but has no content at all
            df_old = None

    if df_old is not None and df_old.shape[0] != 0:
        # New rows come first so that keep='first' retains the most recent version of a track.
        df_combined = pd.concat([df_new, df_old], axis=0)
        df_unique = df_combined.drop_duplicates(subset='uris', keep='first')
        df_unique = df_unique.reset_index(drop=True)
        df_unique.to_csv(file_path, mode='w')
    else:
        os.makedirs(os.path.dirname(file_path), exist_ok=True)  # First run: the data folder may not exist yet
        df_new.to_csv(file_path, mode='w')

target_playlist_extraction(sp, url, name)

This method extracts all track information from a given target playlist. Args: sp (Spotipy Authorization): the authorized spotipy credentials object; url (str): the url of the playlist from which to extract information; name (str): the name of the playlist.

Returns:

Type Description
dict

A dictionary containing all features and information pertaining to the target playlist.

Source code in src/data_processing.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
def target_playlist_extraction(sp, url, name):
    """Extract all track information from a single target playlist and save it.

    The collected data is written to the ``target.csv`` file and also returned.

    Args:
        sp (Spotipy Authorization): The authorized spotipy credentials object
        url (str): The url of the playlist from which to extract information
        name (str): The name of the playlist

    Returns:
        (dict): A dictionary containing all features and information pertaining to the target playlist.
    """
    playlist_uri = url2uri(url)
    target_store = construct_storage()

    extract_tracks(sp, playlist_uri, target_store)  # Fill the store with track, artist and audio data
    add_playlist_tracking(name, target_store)  # Record which playlist the tracks belong to
    save_data(target_store, 'target.csv')  # Persist to target.csv

    return target_store

top_playlist_extraction(sp)

Method extracts the tracks in the 20 top-performing playlists from a selection of countries. The countries include: Australia, UK, USA, Canada, Jamaica, South Africa

This method does not return any information, but stores it in the tracks.csv dataset file.

Parameters:

Name Type Description Default
sp Spotipy Authorization

The authorized spotipy credentials object

required
Source code in src/data_processing.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def top_playlist_extraction(sp):
    """Method extracts the tracks in the 20 top-performing playlists from a selection of countries
     The countries include: Australia, UK, USA, Canada, Jamaica, South Africa

     This method does not return any information, but stores it in the tracks.csv dataset file.

     A playlist that fails during extraction is skipped: its partially filled
     store is discarded, the cause of the failure is printed, and processing
     continues with the next playlist.

     Args:
         sp (Spotipy Authorization): The authorized spotipy credentials object
    """
    countries = ['AU', 'GB', 'US', 'CA', 'JM', 'ZA']

    tracks_store = construct_storage()  # Construct track info storage

    for country in countries:  # Iterate through countries
        print(f'Country: {country}')
        top_playlists, names = find_top_playlists(sp, country)  # Find top 20 playlists in each country

        for playlist, name in zip(top_playlists, names):  # Iterate through playlists
            try:
                print(f'Playlist name: {name}')
                store = construct_storage()  # Fresh store per playlist, merged only if extraction succeeds
                extract_tracks(sp, playlist, store)
                add_playlist_tracking(name, store)
                merge_stores(tracks_store, store)  # Merge the playlist information, by merging the data stored.
                time.sleep(2)  # Respect API limits through a forced sleep
            except Exception as err:
                # Best effort: keep going, but report why this playlist was dropped.
                print(f"Error accessing playlist {name} tracks: {err!r}")
        print('-----------------------------------------------------------------------------')

    save_data(tracks_store)  # Save the data

update_tracking(df)

Method updates the dataset_growth.csv file when new tracks are added to the dataset to record dataset growth

Parameters:

Name Type Description Default
df DataFrame

The dataframe containing all stored tracks, including new additions

required
Source code in src/data_processing.py
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
def update_tracking(df):
    """Method updates the `dataset_growth.csv` file when new tracks are added to the dataset to record dataset growth

    One row holding the current date (``%d-%m-%Y``), the current time
    (``%H:%M:%S``) and the number of rows in ``df`` is appended to the file at
    ``<project root>/data/dataset_growth.csv``. If that file does not exist
    yet, it is created containing only the new row.

    Args:
        df (DataFrame): The dataframe containing all stored tracks, including new additions
    """
    root_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
    file_path = os.path.join(root_path, 'data', 'dataset_growth.csv')

    current_time = datetime.now()  # Local time of this update
    new_entry = pd.DataFrame.from_dict({'date': [current_time.strftime("%d-%m-%Y")],
                                        'time': [current_time.strftime("%H:%M:%S")],
                                        'track_count': [df.shape[0]]})

    if os.path.exists(file_path):
        previous = pd.read_csv(file_path, index_col=0)  # Read in the growth dataset
        # ignore_index=True already renumbers the rows, so no separate reset_index is needed.
        tracking_df = pd.concat([previous, new_entry], axis=0, ignore_index=True)
    else:
        tracking_df = new_entry  # First recorded entry: start the growth log

    tracking_df.to_csv(file_path, mode='w')

url2uri(url)

Method extracts the uri from a given Spotify url

Parameters:

Name Type Description Default
url str

The Spotify playlist url

required

Returns: (str): The uri of the Spotify playlist

Source code in src/data_processing.py
31
32
33
34
35
36
37
38
39
def url2uri(url):
    """Method extracts the uri from a given Spotify url

    The query string (everything from the first ``?``) and any trailing ``/``
    are removed, and the last remaining path segment is returned. A value that
    contains no ``/`` or ``?`` is returned unchanged.

    Args:
        url (str): The Spotify playlist url
    Returns:
        (str): The uri of the Spotify playlist
    """
    # Drop the query first so that a '/' inside it cannot be mistaken for a path separator.
    without_query = url.partition('?')[0]
    return without_query.rstrip('/').rsplit('/', 1)[-1]