Skip to content

Dataset

Dataset Page

The dataset page (accessible from the left sidebar) provides a dataset analytics UI and a data download platform.

Dataset Page Documentation

Monitor

Monitor class serves as a dataset monitor

Attributes:

Name Type Description
history_name str

Name of the history dataset file name

tracks_name str

Name of the tracks dataset file name

root_path Path

Path to the root of the project

hist_path Path

Path to the history dataset

track_path Path

Path to the tracks dataset

history DataFrame

History dataset as a dataframe

tracks DataFrame

Track dataset as a dataframe

Source code in src/pages/dataset.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
class Monitor:
    """Dataset monitor that loads the project datasets and exposes them for analysis

    Attributes:
        history_name (str): File name of the history dataset
        tracks_name (str): File name of the tracks dataset
        root_path (str): Absolute path to the root of the project
        hist_path (str): Path to the history dataset
        track_path (str): Path to the tracks dataset
        history (DataFrame): History dataset as a dataframe
        tracks (DataFrame): Track dataset as a dataframe
    """
    def __init__(self):
        """Method loads both datasets from the `data` directory and parses the history date/time columns"""
        self.history_name = 'dataset_growth.csv'
        self.tracks_name = 'tracks.csv'

        # Project root: the directory two levels above the one containing this module
        module_dir = os.path.dirname(__file__)
        parent_dir = os.path.dirname(module_dir)
        self.root_path = os.path.abspath(os.path.dirname(parent_dir))

        data_dir = os.path.join(self.root_path, 'data')
        self.hist_path = os.path.join(data_dir, self.history_name)
        self.track_path = os.path.join(data_dir, self.tracks_name)

        self.history = pd.read_csv(self.hist_path)
        self.tracks = pd.read_csv(self.track_path)

        # Parse the textual columns: dates are day-month-year, times are hour:minute:second
        self.history['date'] = pd.to_datetime(self.history['date'], format="%d-%m-%Y")
        self.history['time'] = pd.to_datetime(self.history['time'], format='%H:%M:%S')

    def determine_date_range(self):
        """Method reports the span of dates covered by the dataset history file.

        Returns:
            start (datetime): The earliest date available in the dataset history file.
            end (datetime): The latest date available in the dataset history file.
        """
        dates = self.history['date']
        return dates.min(), dates.max()

    def access_acoustic_sample_features(self):
        """Method draws a fixed-seed 10% sample of the tracks dataset, restricted to the pairplot features.

        Note, sampling is used here as the size of the tracks dataset would take a long time to render in a pairplot.

        Returns:
            (DataFrame): Sampled tracks dataframe containing select acoustic features.
        """
        pairplot_features = ['danceability', 'energy', 'loudness', 'speechiness',
                             'acousticness', 'instrumentalness']
        sampled_tracks = self.tracks.sample(frac=0.1, random_state=1)
        return sampled_tracks[pairplot_features]

    def access_specific_features(self, selection: list, sample=True):
        """Method returns the requested columns of the tracks dataset, optionally sampled

        Args:
            selection (list): A list of column names to keep
            sample (bool): If True, a fixed-seed 10% sample of the rows is used. If False, all rows are used.

        Returns:
            (DataFrame): A dataframe containing the specified columns.
        """
        source = self.tracks.sample(frac=0.1, random_state=2) if sample else self.tracks
        return source[selection]

    def access_artist_names(self):
        """Method collects the distinct values of the `artist_names` column of the tracks dataset.

        Returns:
            (list): A list of unique artists available in the tracks dataset.
        """
        return self.tracks['artist_names'].unique().tolist()

    def access_feature_definitions(self):
        """Method reads the `data/feature_def.txt` file, which provides the feature definitions

        Returns:
            (str): Feature definitions
        """
        definitions_path = os.path.join(self.root_path, 'data', 'feature_def.txt')
        with open(definitions_path, 'r') as handle:
            return handle.read()

__init__()

Method constructs the dataset monitor for data analysis within this page

Source code in src/pages/dataset.py
22
23
24
25
26
27
28
29
30
31
32
33
34
def __init__(self):
    """Method loads both datasets from the `data` directory and parses the history date/time columns"""
    self.history_name = 'dataset_growth.csv'
    self.tracks_name = 'tracks.csv'

    # Project root: the directory two levels above the one containing this module
    module_dir = os.path.dirname(__file__)
    parent_dir = os.path.dirname(module_dir)
    self.root_path = os.path.abspath(os.path.dirname(parent_dir))

    data_dir = os.path.join(self.root_path, 'data')
    self.hist_path = os.path.join(data_dir, self.history_name)
    self.track_path = os.path.join(data_dir, self.tracks_name)

    self.history = pd.read_csv(self.hist_path)
    self.tracks = pd.read_csv(self.track_path)

    # Parse the textual columns: dates are day-month-year, times are hour:minute:second
    self.history['date'] = pd.to_datetime(self.history['date'], format="%d-%m-%Y")
    self.history['time'] = pd.to_datetime(self.history['time'], format='%H:%M:%S')

access_acoustic_sample_features()

Method samples the tracks dataset and selects acoustic features for visualization in the pairplot.

Note, sampling is used here as the size of the tracks dataset would take a long time to render in a pairplot.

Returns:

Type Description
DataFrame

Sampled tracks dataframe containing select acoustic features.

Source code in src/pages/dataset.py
45
46
47
48
49
50
51
52
53
54
55
56
def access_acoustic_sample_features(self):
    """Method draws a fixed-seed 10% sample of the tracks dataset, restricted to the pairplot features.

    Note, sampling is used here as the size of the tracks dataset would take a long time to render in a pairplot.

    Returns:
        (DataFrame): Sampled tracks dataframe containing select acoustic features.
    """
    pairplot_features = ['danceability', 'energy', 'loudness', 'speechiness',
                         'acousticness', 'instrumentalness']
    sampled_tracks = self.tracks.sample(frac=0.1, random_state=1)
    return sampled_tracks[pairplot_features]

access_artist_names()

Method determines the unique artist names in the tracks dataset.

Returns:

Type Description
list

A list of unique artists available in the tracks dataset.

Source code in src/pages/dataset.py
74
75
76
77
78
79
80
81
def access_artist_names(self):
    """Method collects the distinct values of the `artist_names` column of the tracks dataset.

    Returns:
        (list): A list of unique artists available in the tracks dataset.
    """
    return self.tracks['artist_names'].unique().tolist()

access_feature_definitions()

Method allows for the data/feature_def file to be read-in providing feature definitions

Returns:

Type Description
str

Feature definitions

Source code in src/pages/dataset.py
83
84
85
86
87
88
89
90
91
92
def access_feature_definitions(self):
    """Method reads the `data/feature_def.txt` file, which provides the feature definitions

    Returns:
        (str): Feature definitions
    """
    definitions_path = os.path.join(self.root_path, 'data', 'feature_def.txt')
    with open(definitions_path, 'r') as handle:
        return handle.read()

access_specific_features(selection, sample=True)

Method allows for access to specific acoustic features in the tracks dataset

Parameters:

Name Type Description Default
selection list

A list of acoustic features

required
sample bool

If True, the features must be sampled. If False, the feature is not sampled.

True

Returns:

Type Description
DataFrame

A dataframe containing the specified feature.

Source code in src/pages/dataset.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def access_specific_features(self, selection: list, sample=True):
    """Method returns the requested columns of the tracks dataset, optionally sampled

    Args:
        selection (list): A list of column names to keep
        sample (bool): If True, a fixed-seed 10% sample of the rows is used. If False, all rows are used.

    Returns:
        (DataFrame): A dataframe containing the specified columns.
    """
    source = self.tracks.sample(frac=0.1, random_state=2) if sample else self.tracks
    return source[selection]

determine_date_range()

Method determines the date range in the dataset history file.

Returns:

Name Type Description
start datetime

The earliest date available in the dataset history file.

end datetime

The latest date available in the dataset history file.

Source code in src/pages/dataset.py
36
37
38
39
40
41
42
43
def determine_date_range(self):
    """Method reports the span of dates covered by the dataset history file.

    Returns:
        start (datetime): The earliest date available in the dataset history file.
        end (datetime): The latest date available in the dataset history file.
    """
    dates = self.history['date']
    return dates.min(), dates.max()

artist_matching(artists)

Method performs a regex search against all artists in the database, using the artists search query.

Parameters:

Name Type Description Default
artists str

A search pattern of artists names (expect the string to be of format name,name,name with no whitespaces)

required

Returns: (list): A list of matching artist names which the user can then select from.

Source code in src/pages/dataset.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def artist_matching(artists: str):
    """Method performs a case-insensitive, whole-word regex search against all artists in the database.

    Args:
        artists (str): A search query of artist names separated by `,` or `|` (e.g. name,name,name).
            Whitespace around each name and empty entries are ignored.

    Returns:
        (list): A list of matching artist names which the user can then select from. The list is
            empty when the query contains no usable name.
    """
    # Drop blank tokens: an empty alternative in the pattern would match every artist name
    artist_names = [token.strip() for token in re.split(r',|\|', artists) if token.strip()]
    if not artist_names:
        return []

    # Lookarounds are used instead of `\b` so that a query starting or ending with a non-word
    # character can still match; for queries edged by word characters the two are equivalent.
    alternatives = '|'.join(map(re.escape, artist_names))
    pattern = re.compile(fr"(?<!\w)(?:{alternatives})(?!\w)", re.IGNORECASE)

    names = st.session_state.monitor.access_artist_names()
    # Non-string entries (e.g. NaN for a missing artist) cannot be searched and are skipped
    return [name for name in names if isinstance(name, str) and pattern.search(name)]

create_feature_selection(maximum=12)

Method creates a streamlit multiselection capability, in which a user can select acoustic features to be visualized

Parameters:

Name Type Description Default
maximum int

The maximum number of features that can be selected.

12

Returns:

Type Description
list

A list of selected feature names

Source code in src/pages/dataset.py
120
121
122
123
124
125
126
127
128
129
130
131
132
def create_feature_selection(maximum=12):
    """Method renders a streamlit multiselect widget from which a user picks the features to be visualized

    Args:
        maximum (int): The maximum number of features that can be selected.

    Returns:
        (list): A list of selected feature names
    """
    feature_choices = [
        'artist_pop', 'track_pop', 'danceability', 'energy', 'loudness', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valences', 'durations_ms', 'tempos',
    ]
    # 'loudness' is pre-selected when the widget is first shown
    return st.multiselect('Select features', feature_choices, default='loudness', max_selections=maximum)

datasets_download_section()

Method prepares the tracks.csv and the dataset_growth.csv files for download.

Source code in src/pages/dataset.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def datasets_download_section():
    """Method renders download buttons for the `tracks.csv` and the `dataset_growth.csv` files."""

    def offer_download(path, label, file_name, key):
        # Read the raw bytes of the file and hand them to a streamlit download button
        with open(path, 'rb') as handle:
            payload = handle.read()
        st.download_button(label=label, data=payload, file_name=file_name, key=key)

    # Tracks download
    offer_download(st.session_state.monitor.track_path,
                   'Click to download tracks dataset',
                   'tracks.csv',
                   'download_dataset')

    # Growth download
    offer_download(st.session_state.monitor.hist_path,
                   'Click to download tracks growth dataset',
                   'dataset_growth.csv',
                   'download_dataset_growth')

date_sliders(start, end)

Method creates the dataset growth date sliders, in order to determine the start and end date for visualization

Parameters:

Name Type Description Default
start datetime

Earliest available date in the history dataset

required
end datetime

Latest available date in the history dataset.

required

Returns:

Name Type Description
start_date datetime

The slider selected start date

end_date datetime

The slider selected end date

Source code in src/pages/dataset.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
def date_sliders(start, end):
    """Method creates the dataset growth date sliders, in order to determine the start and end date for visualization

    Args:
        start (datetime): Earliest available date in the history dataset. Must provide `to_pydatetime()`
            (e.g. a pandas Timestamp).
        end (datetime): Latest available date in the history dataset. Must provide `to_pydatetime()`.

    Returns:
        start_date (datetime): The slider selected start date
        end_date (datetime): The slider selected end date
    """
    # Convert once; both sliders share the same bounds
    earliest = start.to_pydatetime()
    latest = end.to_pydatetime()

    start_date = st.slider(label='Select start date',
                           min_value=earliest,
                           max_value=latest)
    # The end slider starts at the latest date. This uses the `end` argument rather than a
    # module-level `max_date` name, so the function does not depend on global state.
    end_date = st.slider(label='Select end date',
                         min_value=earliest,
                         max_value=latest,
                         value=latest)
    return start_date, end_date

generate_artist_comparison(selection, artist_filter)

Method generates a swarmplot enabling artist comparison across various acoustic features. Args: selection (list): A list of features, such that their distributions will be visually compared. artist_filter (list): A filter of artist names; only tracks by these artists are included in the comparison.

Returns:

Type Description
PyPlot Figure

A swarmplot showing comparable artist acoustic features

Source code in src/pages/dataset.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
def generate_artist_comparison(selection: list, artist_filter: list):
    """Method generates a swarmplot enabling artist comparison across various acoustic features.

    Note, the selected features are normalized within the range [-1, 1] and only the first selected
    feature is drawn on the x-axis.

    Args:
        selection (list): A list of features, such that their distributions will be visually compared.
            The list is left unmodified.
        artist_filter (list): A filter of artist names; only tracks by these artists are plotted.

    Returns:
        (PyPlot Figure): A swarmplot showing comparable artist acoustic features
    """
    features = list(selection)  # Work on a copy so the caller's list is never mutated
    df = st.session_state.monitor.access_specific_features(features + ['artist_names'], sample=False)
    # Keep artist only tracks; copy so the scaled values are written to an independent frame
    df = df[df['artist_names'].isin(artist_filter)].copy()

    scaler = MinMaxScaler(feature_range=(-1, 1))  # Normalize the data between [-1, 1] for visual purposes
    df[features] = scaler.fit_transform(df[features])

    sns.set()
    fig, ax = plt.subplots(figsize=(10, 6))
    # Draw on the axes created above explicitly instead of relying on the implicit current axes
    sns.swarmplot(data=df, x=features[0], y='artist_names', hue='artist_names', palette="Set2",
                  legend=False, ax=ax)
    return fig

generate_distribution(selection)

Method generates feature distributions to allow feature comparisons.

Note, all features are normalized within the same range [-1, 1] for comparative visualization purposes.

Parameters:

Name Type Description Default
selection list

A list of features, such that their distributions will be visually compared.

required

Returns:

Type Description
PyPlot Figure

A figure showcasing the various acoustic feature distributions

Source code in src/pages/dataset.py
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def generate_distribution(selection: list):
    """Method generates feature distributions to allow feature comparisons.

    Note, all features are normalized within the same range [-1, 1] for comparative visualization purposes.

    Args:
        selection (list): A list of features, such that their distributions will be visually compared.

    Returns:
        (PyPlot Figure): A figure showcasing the various acoustic feature distributions
    """
    # Copy so the scaled values are written to an independent frame, never back into the monitor's data
    df = st.session_state.monitor.access_specific_features(selection).copy()

    scaler = MinMaxScaler(feature_range=(-1, 1))  # Normalize the data between [-1, 1] for visual purposes
    df[selection] = scaler.fit_transform(df[selection])

    sns.set()
    fig, ax = plt.subplots(figsize=(10, 6))

    for feature in selection:  # Overlay the feature distributions on the same axes
        sns.kdeplot(data=df, x=feature, label=feature, fill=True, ax=ax)
    ax.set_title('Acoustic Feature Distribution')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    ax.legend()

    return fig

generate_growth_plot(start, end)

This method generates a line plot showcasing the number of unique tracks in the dataset over time, showcasing its growth

Parameters:

Name Type Description Default
start datetime

The starting datetime to visualize the dataset growth

required
end datetime

The end datetime to visualize the dataset growth

required

Returns:

Type Description
PyPlot Figure

A line plot figure showcasing the dataset growth over time

Source code in src/pages/dataset.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
@st.cache_resource
def generate_growth_plot(start, end):
    """This method draws the dataset's track count over time as a line plot, showcasing its growth

    Args:
        start (datetime): The starting datetime to visualize the dataset growth
        end (datetime): The end datetime to visualize the dataset growth

    Returns:
        (PyPlot Figure): A line plot figure showcasing the dataset growth over time
    """
    history = st.session_state.monitor.history
    # Keep only the rows whose date lies within [start, end], both bounds included
    window = history[history['date'].between(start, end)]

    sns.set()
    fig, axes = plt.subplots(1, 1, figsize=(12, 6))
    sns.lineplot(data=window, x='date', y='track_count', color='black')
    # Shade the area under the growth line
    axes.fill_between(window['date'], window['track_count'], alpha=0.2, color='red')
    axes.set(xlabel='Date', ylabel='Track Count')

    return fig

generate_pair_plot()

This function generates a pair plot showcasing the relationship between acoustic analysis features of the tracks in the dataset.

Returns:

Type Description
PyPlot Figure

A pyplot figure showcasing the acoustic feature relationships

Source code in src/pages/dataset.py
179
180
181
182
183
184
185
186
187
188
189
190
@st.cache_resource
def generate_pair_plot():
    """This function generates a pair plot showcasing the relationship between acoustic analysis features of the tracks in the dataset.

    The plotted data is the sampled feature set returned by the monitor's
    `access_acoustic_sample_features` method; the diagonal panels show kernel density estimates.

    Returns:
        (PyPlot Figure): A pyplot figure showcasing the acoustic feature relationships
    """
    df = st.session_state.monitor.access_acoustic_sample_features()

    sns.set()
    grid = sns.pairplot(df, diag_kind='kde')
    # `figure` is the supported accessor for the underlying figure; `fig` is deprecated in seaborn
    return grid.figure