<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article
  PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "https://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article article-type="research-article" dtd-version="1.1" specific-use="sps-1.9" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
	<front>
		<journal-meta>
			<journal-id journal-id-type="publisher-id">ijeph</journal-id>
			<journal-title-group>
				<journal-title>Interdisciplinary Journal of Epidemiology and Public Health</journal-title>
				<abbrev-journal-title abbrev-type="publisher">Interdiscipl. J. Epidemiol. Public Health</abbrev-journal-title>
			</journal-title-group>
			<issn pub-type="ppub">2665-427X</issn>
			<publisher>
				<publisher-name>Facultad Ciencias de la Salud, Universidad Libre</publisher-name>
			</publisher>
		</journal-meta>
		<article-meta>
			<article-id pub-id-type="doi">10.18041/2665-427X/ijeph.1.11532</article-id>
			<article-categories>
				<subj-group subj-group-type="heading">
					<subject>Original article</subject>
				</subj-group>
			</article-categories>
			<title-group>
				<article-title>Enhancing Obesity Prediction through SMOTE-based Classification Models: A Comparative Study</article-title>
				<trans-title-group xml:lang="es">
					<trans-title>Aumento de la predicción de la obesidad mediante modelos de clasificación basados en SMOTE: Un estudio comparativo</trans-title>
				</trans-title-group>
			</title-group>
			<contrib-group>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0009-0004-7919-2064</contrib-id>
					<name>
						<surname>Kamwele Mutinda</surname>
						<given-names>John</given-names>
					</name>
					<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
				</contrib>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0000-0002-7813-6835</contrib-id>
					<name>
						<surname>Langat</surname>
						<given-names>Amos Kipkorir</given-names>
					</name>
					<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
				</contrib>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0009-0004-5252-759X</contrib-id>
					<name>
						<surname>Marcel Djaha</surname>
						<given-names>Regis Konan</given-names>
					</name>
					<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
				</contrib>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0009-0004-6017-4691</contrib-id>
					<name>
						<surname>Ndoto Munyao</surname>
						<given-names>Jackson</given-names>
					</name>
					<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
				</contrib>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0009-0005-3408-637X</contrib-id>
					<name>
						<surname>Whitaker</surname>
						<given-names>Lee</given-names>
					</name>
					<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
				</contrib>
				<contrib contrib-type="author">
					<contrib-id contrib-id-type="orcid">0009-0008-8422-7427</contrib-id>
					<name>
						<surname>Auma Omondi</surname>
						<given-names>Millicent</given-names>
					</name>
					<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
				</contrib>
			</contrib-group>
			<aff id="aff1">
				<label>1</label>
				<institution content-type="original"> University of Science and Technology of China, Langfang, Hebei, China</institution>
				<institution content-type="orgname">University of Science and Technology of China</institution>
				<addr-line>
					<city>Langfang</city>
					<state>Hebei</state>
				</addr-line>
				<country country="CN">China</country>
			</aff>
			<aff id="aff2">
				<label>2</label>
				<institution content-type="original"> Department of Mathematics, Technology and Innovation-JKUAT, Pan African University Institute for Basic Sciences, Nairobi, Kenya</institution>
				<institution content-type="orgdiv2">Department of Mathematics</institution>
				<institution content-type="orgdiv1">Technology and Innovation-JKUAT</institution>
				<institution content-type="orgname">Pan African University Institute for Basic Sciences</institution>
				<addr-line>
					<city>Nairobi</city>
				</addr-line>
				<country country="KE">Kenya</country>
			</aff>
			<aff id="aff3">
				<label>3</label>
				<institution content-type="original"> Basque Center for Applied Mathematics, Bilbao, Basque, Spain</institution>
				<institution content-type="orgname">Basque Center for Applied Mathematics</institution>
				<addr-line>
					<city>Bilbao</city>
					<state>Basque</state>
				</addr-line>
				<country country="ES">Spain</country>
			</aff>
			<aff id="aff4">
				<label>4</label>
				<institution content-type="original"> African Institute for Mathematical Sciences, Limbe, Cameroon </institution>
				<institution content-type="orgname">African Institute for Mathematical Sciences</institution>
				<addr-line>
					<city>Limbe</city>
				</addr-line>
				<country country="CM">Cameroon</country>
			</aff>
			<aff id="aff5">
				<label>5</label>
				<institution content-type="original">South Eastern Kenya University, Kitui County, Kenya </institution>
				<institution content-type="orgname">South Eastern Kenya University</institution>
				<addr-line>
					<city>Kitui County</city>
				</addr-line>
				<country country="KE">Kenya</country>
			</aff>
			<author-notes>
				<corresp id="c1">
					<label>Corresponding author:</label> Amos Kipkorir Langat. E-mail: <email>moskiplangat@gmail.com</email>
				</corresp>
				<fn fn-type="other" id="fn1">
					<label>Author contributions:</label>
					<p> John Kamwele Mutinda: Writing - Original draft, Review, Software, Methodology, Data curation, Conceptualization. Amos Kipkorir Langat: Writing - Review, Methodology, Data curation and Conceptualization. Regis Konan Marcel Djaha: Writing - Review, Methodology, Data curation and Conceptualization. Jackson Ndoto Munyao: Writing - Original draft and Review and Software. Lee Whitaker Fundi Ireri: Writing - Review, Methodology, Data curation and Conceptualization. Millicent Auma Omondi: Methodology, Data curation and Conceptualization.</p>
				</fn>
				<fn fn-type="conflict" id="fn3">
					<label>Conflicts of interest:</label>
					<p> The authors declare no conflicts of interest.</p>
				</fn>
			</author-notes>
			<pub-date date-type="pub" publication-format="electronic">
				<day>30</day>
				<month>06</month>
				<year>2024</year>
			</pub-date>
			<pub-date date-type="collection" publication-format="electronic">
				<season>Jan-Jun</season>
				<year>2024</year>
			</pub-date>
			<volume>7</volume>
			<issue>1</issue>
			<elocation-id>e-11532</elocation-id>
			<history>
				<date date-type="received">
					<day>18</day>
					<month>03</month>
					<year>2024</year>
				</date>
				<date date-type="accepted">
					<day>30</day>
					<month>06</month>
					<year>2024</year>
				</date>
			</history>
			<permissions>
				<license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by-nc-nd/4.0/" xml:lang="en">
					<license-p>This is an open-access article distributed under the terms of the Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC BY-NC-ND 4.0)</license-p>
				</license>
			</permissions>
			<abstract>
				<title>Abstract</title>
				<sec>
					<title>Objective:</title>
					<p> To use SMOTE to enhance class balance and compare the performance of different classification methods before and after applying SMOTE. </p>
				</sec>
				<sec>
					<title>Methods:</title>
					<p> Study used a dataset obtained from Kaggle, which consisted of several health-related features linked to obesity prediction. The design involved checking for class imbalance within the dataset, which affected initial model performance. SMOTE was applied to synthetically increase the representation of minority classes, effectively reducing the class imbalance. The experiment was conducted in two stages: 1. Training and testing the classification algorithms before applying SMOTE. 2. Training and testing the same models after applying SMOTE to enhance class balance. The performance of all models was evaluated based on metrics before and after the SMOTE application.</p>
				</sec>
				<sec>
					<title>Results:</title>
					<p> Initially, models like Logistic Regression and Naive Bayes struggled with low sensitivity and specificity, KNN (<italic>k</italic>=5) showed poor specificity. Significant improvements were observed across all models after applying SMOTE. Logistic Regression, despite a decrease in accuracy (-8.8%), sensitivity and specificity increased substantially (+56.7%), with balanced accuracy improving (+16.6%). Naive Bayes saw a modest accuracy increase (+2.3%), with sensitivity and specificity improving (+47.9%). The KNN (<italic>k</italic>=5) classifier exhibited a transformative enhancement with sensitivity and specificity increasing (+96.0%) and balanced accuracy (+28.3%). Deep Learning showed a significant increase in sensitivity (+69.8%), balanced accuracy (+29.4%), and an improvement in precision and F1-score despite a slight decrease in specificity (-10.9%). </p>
				</sec>
				<sec>
					<title>Conclusion: </title>
					<p>The study highlights the importance of SMOTE in healthcare applications, contributing to more accurate predictions and reliable healthcare decision-making. The results demonstrate that while there might be slight trade-offs, the overall improvements in key metrics such as sensitivity, specificity, balanced accuracy, precision, and F1-score affirm the utility of SMOTE in enhancing model performance for imbalanced datasets.</p>
				</sec>
			</abstract>
			<trans-abstract xml:lang="es">
				<title>Resumen</title>
				<sec>
					<title>Objetivo:</title>
					<p> Utilizar Técnica de Sobremuestreo de Minorías Sintéticas (SMOTE) para mejorar el equilibrio de clases y comparar el rendimiento de distintos métodos de clasificación antes y después de aplicar SMOTE.</p>
				</sec>
				<sec>
					<title>Métodos:</title>
					<p> Los métodos de clasificación fueron Regresión Logística, Naive Bayes, KNN (k=5) y Aprendizaje Profundo. Cada modelo fue entrenado y probado en el conjunto de datos, antes y después de aplicar SMOTE. Se utilizaron las métricas de evaluación: Precisión, Sensibilidad, Especificidad, Precisión equilibrada, Puntuación F1.</p>
				</sec>
				<sec>
					<title>Resultados: </title>
					<p>Modelos como la regresión logística y Naive Bayes tuvieron problemas con sensibilidad y especificidad bajas, KNN (k=5) mostró una especificidad deficiente. Con SMOTE, se observaron mejoras significativas en todos los modelos. La regresión logística, a pesar de una disminución de la precisión (-8.8%), la sensibilidad y la especificidad aumentaron sustancialmente (+56.7%), y mejoró la precisión equilibrada (+16.6%). Naive Bayes experimentó un modesto aumento de la precisión (+2.3%), mejoró la sensibilidad y la especificidad (+47.9%). El clasificador KNN mostró una mejora transformadora con aumento de la sensibilidad, la especificidad (+96.0%) y precisión equilibrada (+28.3%). El aprendizaje profundo mostró aumento significativo de sensibilidad (+69.8%), exactitud equilibrada (+29.4%) y una mejora notable de la precisión y la puntuación F1 a pesar de un ligero descenso de la especificidad (-10.9%).</p>
				</sec>
				<sec>
					<title>Conclusiones:</title>
					<p> SMOTE contribuye a realizar predicciones más exactas y fiables. Aunque puede haber ligeras desventajas, las mejoras generales en las métricas usadas confirman la utilidad de SMOTE para mejorar el rendimiento de los modelos en conjuntos de datos desequilibrados.</p>
				</sec>
			</trans-abstract>
			<kwd-group xml:lang="en">
				<title>Keywords:</title>
				<kwd>Obesity</kwd>
				<kwd>classification algorithms</kwd>
				<kwd>synthetic minority over-sampling technique (smote)</kwd>
				<kwd>imbalanced data</kwd>
				<kwd>healthcare decision-making</kwd>
			</kwd-group>
			<kwd-group xml:lang="es">
				<title>Palabras clave:</title>
				<kwd>Obesidad</kwd>
				<kwd>algoritmos</kwd>
				<kwd>Técnica de Sobremuestreo de Minorías Sintéticas (smote)</kwd>
				<kwd>datos desbalanceados</kwd>
				<kwd>toma de decisiones en salud</kwd>
			</kwd-group>
			<counts>
				<fig-count count="9"/>
				<table-count count="5"/>
				<equation-count count="16"/>
				<ref-count count="39"/>
				<page-count count="0"/>
			</counts>
		</article-meta>
	</front>
	<body>
		<boxed-text id="bx1">
			<sec>
				<title>Key study facts</title>
				<p>
					<table-wrap id="t1">
						<table>
							<colgroup>
								<col/>
								<col/>
							</colgroup>
							<tbody>
								<tr>
									<td align="left">Objective:</td>
									<td align="left">To assess the impact of the Synthetic Minority Over-sampling Technique (SMOTE) on class imbalance in predicting obesity, using multiple classification algorithms </td>
								</tr>
								<tr>
									<td align="left">Study design:</td>
									<td align="left">The study used a dataset obtained from Kaggle, which consisted of several health-related features linked to obesity prediction. The design involved checking for class imbalance within the dataset, which affected initial model performance. SMOTE was applied to synthetically increase the representation of minority classes, effectively reducing the class imbalance. The experiment was conducted in two stages: 1. Training and testing the classification algorithms before applying SMOTE. 2. Training and testing the same models after applying SMOTE to enhance class balance. Performance of all models was evaluated based on metrics before and after SMOTE application </td>
								</tr>
								<tr>
									<td align="left">Source of information:</td>
									<td align="left">The dataset used for this study was obtained from Kaggle, a platform known for providing a wide variety of datasets for data analysis and machine learning tasks. The dataset includes multiple health-related attributes such as BMI, physical activity, etc.</td>
								</tr>
								<tr>
									<td align="left">Statistical analysis:</td>
									<td align="left">The classification methods utilized in this study were Logistic Regression, Naïve Bayes, KNN (k=5), and Deep Learning. Each model was trained and tested on the dataset before and after applying SMOTE. The following evaluation metrics were used: • Accuracy: The proportion of correct predictions out of total predictions. • Sensitivity (Recall): The model’s ability to correctly identify positive cases. • Specificity: The model’s ability to correctly identify negative cases. • Balanced Accuracy: The average of sensitivity and specificity. • Precision: The proportion of positive predictions that are positive. • F1-Score: The harmonic mean of precision and recall, providing a balance between the two.</td>
								</tr>
								<tr>
									<td align="left">Principal findings:</td>
									<td align="left">The application of SMOTE significantly enhanced model performance. Key findings include: • Logistic Regression: Despite a -8.8% decrease in accuracy, sensitivity and specificity increased by +56.7%, leading to a +16.6% improvement in balanced accuracy. • Naive Bayes: A modest accuracy increase of +2.3% was observed, with sensitivity and specificity improving by +47.9%. • KNN (k=5): The classifier showed the most significant improvement, with sensitivity and specificity increasing by +96.0% and balanced accuracy improving by +28.3%. • Deep Learning: Sensitivity increased by +69.8%, balanced accuracy by +29.4%, and precision and F1-score improved, despite a slight decrease in specificity by -10.9%. These results demonstrate that SMOTE effectively mitigates class imbalance issues, improving model performance, particularly in sensitivity and balanced accuracy, across all tested algorithms. The study highlights the utility of SMOTE in healthcare prediction models, where accurate identification of minority classes is crucial for decision-making </td>
								</tr>
							</tbody>
						</table>
					</table-wrap>
				</p>
			</sec>
		</boxed-text>
		<sec sec-type="intro">
			<title>Introduction</title>
			<p>Obesity has become increasingly prevalent worldwide and is now a major contributor to poor health, surpassing undernutrition and infectious diseases <xref ref-type="bibr" rid="B1"><sup>1</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B2"><sup>2</sup></xref>. It is linked to various serious health conditions such as diabetes, heart disease, cancer, and sleep disorders <xref ref-type="bibr" rid="B3"><sup>3</sup></xref>. Obesity is typically defined by a body-mass index (BMI) of 30 kg/m² or higher, but this doesn’t fully capture the health risks associated with even modest overweight or intra-abdominal fat <xref ref-type="bibr" rid="B1"><sup>1</sup></xref>. The global rise in obesity is due to genetic factors, easy access to high-calorie foods, and reduced physical activity in modern society. It’s no longer just a cosmetic issue but a global epidemic threatening overall well-being.</p>
			<p>Machine learning algorithms have revolutionized the health sector by enabling powerful classification techniques for various tasks such as disease diagnosis, risk prediction, and treatment recommendation. These algorithms leverage large datasets to identify patterns and relationships within medical data, allowing for more accurate and efficient decision-making. They have demonstrated remarkable power in distinguishing between disease states, stratifying patients based on risk profiles, and optimizing treatment strategies <xref ref-type="bibr" rid="B4"><sup>4</sup></xref><sup>-</sup><xref ref-type="bibr" rid="B6"><sup>6</sup></xref>. </p>
			<p>Machine learning algorithms have been instrumental in classifying obesity within human healthcare, offering a nuanced approach to understanding and addressing this complex condition. These algorithms utilize diverse data sources such as electronic health records, medical imaging, genetic information, and lifestyle factors to accurately classify individuals based on their obesity status. Through sophisticated pattern recognition techniques, machine learning models can differentiate between various degrees of obesity, assess associated health risks, and personalize intervention strategies accordingly. These algorithms not only consider traditional metrics like BMI but also incorporate additional factors such as body composition, metabolic markers, and genetic predispositions to provide a more comprehensive evaluation of obesity and its implications for overall health. By leveraging the power of machine learning, healthcare professionals can enhance obesity classification accuracy, tailor interventions to individual needs, and ultimately improve patient outcomes <xref ref-type="bibr" rid="B7"><sup>7</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B8"><sup>8</sup></xref>.</p>
			<p>The authors in <xref ref-type="bibr" rid="B8"><sup>8</sup></xref> employed machine learning algorithms for predicting obesity risk, leveraging a dataset comprising over 1,100 individuals spanning diverse age groups and obesity statuses. Nine prominent machine learning algorithms were applied and evaluated, including k-nearest neighbor (k-NN), random forest, logistic regression, multilayer perceptron, support vector machine (SVM), naïve Bayes, adaptive boosting, decision tree, and gradient boosting classifier. The logistic regression algorithm demonstrated the highest accuracy at 97.1%, outperforming other classifiers, while the gradient boosting algorithm exhibited the lowest accuracy at 64.1%. This research aimed to predict obesity risk but also sought to enhance understanding of the underlying factors contributing to obesity, informing preventive strategies and interventions.</p>
			<p>The authors in <xref ref-type="bibr" rid="B9"><sup>9</sup></xref> utilized machine learning (ML) methods such as Logistic Regression, Classification and Regression Trees (CART), and Naïve Bayes to predict obesity using publicly available health data, aiming to surpass traditional models and identify key risk factors. Logistic Regression emerges as the most effective method, though moderate concordance between predicted and measured obesity is observed. Significant risk factors for obesity in adults include location, marital status, age groups, education, dietary habits, mental health disorders, physical activity, and smoking. Addressing data imbalance using Synthetic Minority Oversampling Technique (SMOTE), this study underscores the importance of identifying these risk factors to inform policy interventions aimed at controlling chronic diseases, especially obesity-related complications. Applying ML methods to available health data holds promise for advancing our understanding of obesity and its associated risk factors, facilitating more robust preventive strategies.</p>
			<p>A study by <xref ref-type="bibr" rid="B10"><sup>10</sup></xref> introduces a novel method utilizing data science techniques to analyze genetic variants extracted from publicly available genetic profiles and a curated database, the National Human Genome Research Institute Catalog, for predicting obesity. Genetic variants are indexed and utilized as inputs in various machine learning algorithms to classify participants into Normal Class or Risk Class based on body mass index status. A set of principal variables consisting of 13 Single Nucleotide Polymorphisms is generated through dimensionality reduction to apply different machine-learning methods. The performance of various algorithms, including gradient boosting, generalized linear model, classification and regression trees, <italic>k</italic>-nearest neighbors, support vector machines, random forest, and multilayer perceptron neural network, is evaluated using receiver operator characteristic curves and area under the curve metrics. The support vector machine demonstrated the highest performance with an area under the curve value of 90.5, suggesting its efficacy in identifying significant factors among the initial 6,622 variables for classifying subjects into BMI-related classes.</p>
			<p>A study by <xref ref-type="bibr" rid="B11"><sup>11</sup></xref> utilized four enhanced machine learning models to predict obesity among high school students, considering both risk and protective factors. The models included binary logistic regression, improved decision tree, weighted <italic>k</italic>-nearest neighbor, and artificial neural network (ANN). Using nine health-related behaviors from the 2015 Youth Risk Behavior Surveillance System for Tennessee as inputs, the results indicated significant performance improvements over traditional logistic regression. Specifically, the improved decision tree model achieved 80.2% accuracy and 90.7% specificity, the weighted KNN model achieved 88.8% accuracy and 93.4% specificity, and the ANN model achieved 84.2% accuracy and 99.5% specificity. These findings suggest the potential of machine learning in effectively predicting and addressing adolescent obesity, with implications for interventions aimed at slowing its increase.</p>
			<p>A study by <xref ref-type="bibr" rid="B12"><sup>12</sup></xref> focused on exploring the relationship between physical activity and weight status and evaluating the performance of various machine learning and traditional statistical methods. Utilizing National Health and Nutrition Examination Survey data from 2003 to 2006, the study included 7,162 participants meeting inclusion criteria. Eleven classification algorithms were implemented, including logistic regression, naïve Bayes, and Radial Basis Function. The random subspace classifier algorithm demonstrated the highest accuracy and area under the receiver operating characteristic curve. Vigorous and moderate-intensity activity durations emerged as significant attributes. While logistic regression ranked middling among the methods, it provided valuable insights. Gender, age, and race/ethnicity also played essential roles in weight outcomes. Tailored intervention strategies considering these factors are crucial in combating obesity and addressing disparities among demographic populations.</p>
			<p>As we delve into the analysis, it becomes evident that oversampling techniques, notably the Synthetic Minority Over-sampling Technique (SMOTE), are integral components of the model development process. This study will incorporate this technique to address class imbalance in the dataset and enhance the robustness of model training and evaluation processes. In this study, we propose to predict obesity by employing various classification models: Logistic Regression, KNN, Random Forest, and deep learning. Recognizing the class imbalance inherent in obesity data, we aim to address this challenge by implementing the Synthetic Minority Over-sampling Technique (SMOTE). Our methodology involves training these models before and after oversampling, allowing for a comparative performance analysis. To evaluate the effectiveness of each model, we will employ a range of performance metrics, including Accuracy, Balanced Accuracy, Sensitivity, Specificity, F1-Score, and Precision. This approach aims to fill the existing research gap by comprehensively evaluating obesity prediction models before and after addressing class imbalance, thereby contributing to advancements in obesity management strategies.</p>
		</sec>
		<sec sec-type="materials|methods">
			<title>Materials and methods</title>
			<sec>
				<title>Data</title>
				<p>The dataset used in this study consists of 150 entries and 14 columns. Each column represents a variable providing information about individuals. <xref ref-type="table" rid="t2">Table 1</xref> presents an overview of the key variables, their meanings, and types.</p>
				<p>
					<table-wrap id="t2">
						<label>Table 1</label>
						<caption>
							<title>Variable Overview</title>
						</caption>
						<table>
							<colgroup>
								<col/>
								<col/>
								<col/>
							</colgroup>
							<thead>
								<tr>
									<th align="left">Variable Name</th>
									<th align="left">Meaning</th>
									<th align="left">Type</th>
								</tr>
							</thead>
							<tbody>
								<tr>
									<td align="left">Level</td>
									<td align="left">Academic level of the individual</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Faculty</td>
									<td align="left">Faculty affiliation</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Gender</td>
									<td align="left">Gender of the individual</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Age</td>
									<td align="left">Age of the individual</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Family Size</td>
									<td align="left">Size of the family</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Obese</td>
									<td align="left">Obesity status (Target)</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Income</td>
									<td align="left">Individual’s income level</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Daily Eating</td>
									<td align="left">Daily eating habits</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Fruit Intake</td>
									<td align="left">Frequency of fruit intake</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">BMI Aware</td>
									<td align="left">Awareness of BMI</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Sleeping Hours</td>
									<td align="left">Hours of sleep per day</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Exercise Freq</td>
									<td align="left">Frequency of exercise</td>
									<td align="left">Continuous</td>
								</tr>
								<tr>
									<td align="left">Need help</td>
									<td align="left">Indicates if help is needed</td>
									<td align="left">Categorical</td>
								</tr>
								<tr>
									<td align="left">Need app</td>
									<td align="left">Indicates the need for an app</td>
									<td align="left">Categorical</td>
								</tr>
							</tbody>
						</table>
					</table-wrap>
				</p>
				<p>The dataset provides a mix of continuous and categorical variables, with “Obese” serving as the target variable indicating the obesity status of individuals. These variables lay the foundation for subsequent modeling and analysis.</p>
			</sec>
			<sec>
				<title>Dealing with class imbalance</title>
				<p>SMOTE is a powerful method used to address class imbalance in datasets by generating synthetic samples of the minority class <xref ref-type="bibr" rid="B13"><sup>13</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B14"><sup>14</sup></xref>. It works by synthesizing new instances of the minority class by interpolating between existing minority class instances, thus creating a more balanced dataset. SMOTE helps to mitigate the problem of biased classification models that tend to favor the majority class due to its higher representation in the dataset. By introducing synthetic samples, SMOTE enhances the diversity of the minority class, allowing machine learning algorithms to better learn the underlying patterns and improve classification performance <xref ref-type="bibr" rid="B15"><sup>15</sup></xref><sup>-</sup><xref ref-type="bibr" rid="B17"><sup>17</sup></xref>. This technique has been widely adopted in various fields, including healthcare <xref ref-type="bibr" rid="B18"><sup>18</sup></xref><sup>-</sup><xref ref-type="bibr" rid="B20"><sup>20</sup></xref>, finance <xref ref-type="bibr" rid="B21"><sup>21</sup></xref><sup>-</sup><xref ref-type="bibr" rid="B23"><sup>23</sup></xref>, and image recognition <xref ref-type="bibr" rid="B24"><sup>24</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B25"><sup>25</sup></xref>, where imbalanced datasets are common, contributing to more accurate and robust predictive models. The steps on how SMOTE works are shown below:</p>
				<p>
					<list list-type="bullet">
						<list-item>
							<p>Identify the Minority Class: Determine the minority class in the dataset that needs over-sampling.</p>
						</list-item>
						<list-item>
							<p>Select a Sample from the Minority Class: Randomly choose a sample from the minority class, denoted as 𝑥.</p>
						</list-item>
						<list-item>
							<p>Find Nearest Neighbors: Identify 𝑘-nearest neighbors of 𝑥 from the same class. This is usually done using a distance metric like Euclidean distance. For a given sample 𝑥 from the minority class, find its 𝑘-nearest neighbors using Euclidean distance:</p>
						</list-item>
					</list>
				</p>
				<p>
					<disp-formula id="e1">
						<mml:math>
							<mml:mi>d</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mi>x</mml:mi>
									<mml:mo>,</mml:mo>
									<mml:mi>y</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mi> </mml:mi>
							<mml:msqrt>
								<mml:mrow>
									<mml:munderover>
										<mml:mo stretchy="false">∑</mml:mo>
										<mml:mrow>
											<mml:mi>i</mml:mi>
											<mml:mo>=</mml:mo>
											<mml:mn>1</mml:mn>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>n</mml:mi>
										</mml:mrow>
									</mml:munderover>
									<mml:mrow>
										<mml:msup>
											<mml:mrow>
												<mml:mfenced separators="|">
													<mml:mrow>
														<mml:msub>
															<mml:mrow>
																<mml:mi>x</mml:mi>
															</mml:mrow>
															<mml:mrow>
																<mml:mi>i</mml:mi>
															</mml:mrow>
														</mml:msub>
														<mml:mo>-</mml:mo>
														<mml:msub>
															<mml:mrow>
																<mml:mi>y</mml:mi>
															</mml:mrow>
															<mml:mrow>
																<mml:mi>i</mml:mi>
															</mml:mrow>
														</mml:msub>
													</mml:mrow>
												</mml:mfenced>
											</mml:mrow>
											<mml:mrow>
												<mml:mn>2</mml:mn>
											</mml:mrow>
										</mml:msup>
									</mml:mrow>
								</mml:mrow>
							</mml:msqrt>
						</mml:math>
						<label>(1)</label>
					</disp-formula>
				</p>
				<p>Where 𝑥 = (𝑥<sub>1</sub>, 𝑥<sub>2</sub>, ⋯, 𝑥<sub>𝑛</sub>) and 𝑦 = (𝑦<sub>1</sub>, 𝑦<sub>2</sub>, ⋯, 𝑦<sub>𝑛</sub>) are feature vectors.</p>
				<p>
					<list list-type="bullet">
						<list-item>
							<p>Generate Synthetic Samples: Randomly select one of the 𝑘-nearest neighbors, denoted as 𝑥<sub>neighbor</sub>. Create a synthetic sample 𝑥<sub>synthetic</sub> by interpolating between 𝑥 and 𝑥<sub>neighbor</sub>:</p>
						</list-item>
					</list>
				</p>
				<p>
					<disp-formula id="e2">
						<mml:math>
							<mml:msub>
								<mml:mrow>
									<mml:mi>x</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>s</mml:mi>
									<mml:mi>y</mml:mi>
									<mml:mi>n</mml:mi>
									<mml:mi>t</mml:mi>
									<mml:mi>h</mml:mi>
									<mml:mi>e</mml:mi>
									<mml:mi>t</mml:mi>
									<mml:mi>i</mml:mi>
									<mml:mi>c</mml:mi>
								</mml:mrow>
							</mml:msub>
							<mml:mo>=</mml:mo>
							<mml:mi>x</mml:mi>
							<mml:mo>+</mml:mo>
							<mml:mi> </mml:mi>
							<mml:mi>δ</mml:mi>
							<mml:mo>∙</mml:mo>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>n</mml:mi>
											<mml:mi>e</mml:mi>
											<mml:mi>i</mml:mi>
											<mml:mi>g</mml:mi>
											<mml:mi>h</mml:mi>
											<mml:mi>b</mml:mi>
											<mml:mi>o</mml:mi>
											<mml:mi>r</mml:mi>
										</mml:mrow>
									</mml:msub>
									<mml:mo>-</mml:mo>
									<mml:mi>x</mml:mi>
								</mml:mrow>
							</mml:mfenced>
						</mml:math>
						<label>(2)</label>
					</disp-formula>
				</p>
				<p>where δ is a random number between 0 and 1.</p>
				<p>
					<xref ref-type="fig" rid="f1">Figure 1</xref> below show the Pseudo code for SMOTE and <xref ref-type="fig" rid="f2">Figure 2</xref> show an illustration of SMOTE. Apart from its application in prediction of heart disease, SMOTE has widely been used in other health events where prediction is warranted. For instance, a study <xref ref-type="bibr" rid="B26"><sup>26</sup></xref> used the SMOTE technique to balance the dataset for improving the prediction performance of heart disease using the Decision Tree algorithm on the Cleveland Heart Disease Dataset. The SMOTE technique addressed data imbalances between minority and majority classes. Experimental results showed that balancing the data with SMOTE improved decision tree classification accuracy from 73.3% to 91.4%, an increase of up to 18.1%. A study <xref ref-type="bibr" rid="B27"><sup>27</sup></xref> used the SMOTE technique to balance the dataset and improve the accuracy of rule induction and decision tree models for predicting kidney disease using data from Apollo Hospitals, Tamil Nadu, India. The initial imbalanced dataset hindered model accuracy but applying SMOTE minimized class variation. Experimental findings showed an average accuracy improvement to 98.73%. This method can also enhance accuracy in other imbalanced datasets and be applied in Big Data contexts using Hadoop and MapReduce. A study <xref ref-type="bibr" rid="B28"><sup>28</sup></xref> proposed a two-step approach to improve predictive accuracy in healthcare, addressing the limitations of basic SMOTE. First, modified SMOTE techniques Distance-based SMOTE was used to reduce class imbalance and showed improved accuracy over basic SMOTE. Second, a Stacking Ensemble Framework combining machine learning, deep learning, and ensemble algorithms significantly increased accuracy to 96-97% for various datasets. This framework was validated using the Framingham dataset, Wisconsin Hospital data, and Novel Coronavirus 2019 dataset.</p>
				<p>
					<fig id="f1">
						<label>Figure 1</label>
						<caption>
							<title>SMOTE Pseudo code</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf1.png"/>
					</fig>
				</p>
				<p>
					<fig id="f2">
						<label>Figure 2</label>
						<caption>
							<title>A Simple illustration of SMOTE</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf2.png"/>
					</fig>
				</p>
			</sec>
			<sec>
				<title>Classification Models</title>
				<p>The classification models chosen for this analysis include Logistic Regression, 𝐾-Nearest Neighbors (KNN), Naive Bayes, Random Forest and Deep Learning.</p>
			</sec>
			<sec>
				<title>Logistic Regression</title>
				<p>Logistic Regression is a widely used linear classification algorithm that models the probability of a binary outcome. It is particularly suitable for predicting binary variables, such as whether an individual is obese or not. The logistic regression model predicts the log-odds (logit) of the probability of the positive class <xref ref-type="bibr" rid="B29"><sup>29</sup></xref>. In logistic regression, the hypothesis function ℎ<sub>θ</sub>(𝑥) is defined as:</p>
				<p>
					<disp-formula id="e3">
						<mml:math>
							<mml:msub>
								<mml:mrow>
									<mml:mi>h</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>θ</mml:mi>
								</mml:mrow>
							</mml:msub>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mi>x</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mi>g</mml:mi>
							<mml:mo>(</mml:mo>
							<mml:msup>
								<mml:mrow>
									<mml:mi>θ</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>T</mml:mi>
								</mml:mrow>
							</mml:msup>
							<mml:mi>x</mml:mi>
							<mml:mo>)</mml:mo>
						</mml:math>
						<label>(3)</label>
					</disp-formula>
				</p>
				<p>where the function 𝑔 is the sigmoid function, represented as:</p>
				<p>
					<disp-formula id="e4">
						<mml:math>
							<mml:mi>g</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mi>z</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mi> </mml:mi>
							<mml:mfrac>
								<mml:mrow>
									<mml:mn>1</mml:mn>
								</mml:mrow>
								<mml:mrow>
									<mml:mn>1</mml:mn>
									<mml:mo>+</mml:mo>
									<mml:msup>
										<mml:mrow>
											<mml:mi>e</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mo>-</mml:mo>
											<mml:mi>z</mml:mi>
										</mml:mrow>
									</mml:msup>
								</mml:mrow>
							</mml:mfrac>
							<mml:mi> </mml:mi>
						</mml:math>
						<label>(4)</label>
					</disp-formula>
				</p>
				<p>The sigmoid function ensures that the output of the hypothesis lies between 0 and 1, making it suitable for binary classification tasks.</p>
				<p>Regarding the loss function in logistic regression, it is commonly defined as the logistic loss or binary cross-entropy loss. For a single training example with true label <italic>y</italic> and predicted probability ℎ<sub>θ</sub>(𝑥), the logistic loss is computed as:</p>
				<p>
					<disp-formula id="e5">
						<mml:math>
							<mml:mi>L</mml:mi>
							<mml:mi>o</mml:mi>
							<mml:mi>s</mml:mi>
							<mml:mi>s</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>h</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>θ</mml:mi>
										</mml:mrow>
									</mml:msub>
									<mml:mfenced separators="|">
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
									</mml:mfenced>
									<mml:mo>,</mml:mo>
									<mml:mi>y</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mo>-</mml:mo>
							<mml:mi>y</mml:mi>
							<mml:mrow>
								<mml:mrow>
									<mml:mi mathvariant="normal">log</mml:mi>
								</mml:mrow>
								<mml:mo>⁡</mml:mo>
								<mml:mrow>
									<mml:mfenced separators="|">
										<mml:mrow>
											<mml:msub>
												<mml:mrow>
													<mml:mi>h</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mi>θ</mml:mi>
												</mml:mrow>
											</mml:msub>
											<mml:mfenced separators="|">
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
											</mml:mfenced>
										</mml:mrow>
									</mml:mfenced>
								</mml:mrow>
							</mml:mrow>
							<mml:mo>-</mml:mo>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mn>1</mml:mn>
									<mml:mo>-</mml:mo>
									<mml:mi>y</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mi mathvariant="normal"> </mml:mi>
							<mml:mi mathvariant="normal">l</mml:mi>
							<mml:mi mathvariant="normal">o</mml:mi>
							<mml:mi mathvariant="normal">g</mml:mi>
							<mml:mo>⁡</mml:mo>
							<mml:mo>(</mml:mo>
							<mml:mn>1</mml:mn>
							<mml:mo>-</mml:mo>
							<mml:msub>
								<mml:mrow>
									<mml:mi>h</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>θ</mml:mi>
								</mml:mrow>
							</mml:msub>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mi>x</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>)</mml:mo>
						</mml:math>
						<label>(5)</label>
					</disp-formula>
				</p>
				<p>The objective is to minimize this loss function over the entire training dataset. The loss function penalizes the model more heavily for incorrect predictions, particularly when the predicted probability diverges significantly from the true label. Minimizing this loss function effectively adjusts the model parameters θ to improve the accuracy of predictions in logistic regression.</p>
			</sec>
			<sec>
				<title>K-Nearest Neighbors (KNN)</title>
				<p>The 𝑘-nearest neighbors (KNN) algorithm is a simple, yet effective method used for classification tasks in machine learning. It operates based on the principle of similarity: If an instance is like other instances in the dataset, it is likely to belong to the same class <xref ref-type="bibr" rid="B30"><sup>30</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B31"><sup>31</sup></xref>. </p>
				<p>Here’s how the KNN algorithm works:</p>
				<p>
					<list list-type="bullet">
						<list-item>
							<p>Training Phase: The algorithm stores all the available training data points and their corresponding class labels.</p>
						</list-item>
						<list-item>
							<p>Prediction Phase: When a new instance (point) is presented for classification, the KNN algorithm calculates the distances between this instance and all other instances in the training dataset. The distance measure used is typically Euclidean distance, although other distance metrics such as Manhattan distance or cosine similarity can also be used.</p>
						</list-item>
						<list-item>
							<p>Finding Neighbors: Once distances are calculated, the algorithm identifies the k-nearest neighbors of the new instance. These are the k data points in the training set that are closest to the new instance.</p>
						</list-item>
						<list-item>
							<p>Majority Voting: Finally, the algorithm assigns the class label to the new instance based on a majority vote among its k nearest neighbors. That is, the class label that occurs most frequently among the 𝑘 neighbors is assigned to the new instance.</p>
						</list-item>
					</list>
				</p>
				<p>The working framework of KNN involves calculating the Euclidean distance between the new instance (denoted as 𝑥) and each point in the training dataset (denoted as 𝑦). The Euclidean distance between two points 𝑥 and 𝑦 in an 𝑛-dimensional space can be calculated using the following formula:</p>
				<p>
					<disp-formula id="e6">
						<mml:math>
							<mml:mi mathvariant="normal">E</mml:mi>
							<mml:mi mathvariant="normal">u</mml:mi>
							<mml:mi mathvariant="normal">c</mml:mi>
							<mml:mi mathvariant="normal">l</mml:mi>
							<mml:mi mathvariant="normal">i</mml:mi>
							<mml:mi mathvariant="normal">d</mml:mi>
							<mml:mi mathvariant="normal">e</mml:mi>
							<mml:mi mathvariant="normal">a</mml:mi>
							<mml:mi mathvariant="normal">n</mml:mi>
							<mml:mi mathvariant="normal"> </mml:mi>
							<mml:mi mathvariant="normal">D</mml:mi>
							<mml:mi mathvariant="normal">i</mml:mi>
							<mml:mi mathvariant="normal">s</mml:mi>
							<mml:mi mathvariant="normal">t</mml:mi>
							<mml:mi mathvariant="normal">a</mml:mi>
							<mml:mi mathvariant="normal">n</mml:mi>
							<mml:mi mathvariant="normal">c</mml:mi>
							<mml:mi mathvariant="normal">e</mml:mi>
							<mml:mo>=</mml:mo>
							<mml:mi mathvariant="normal"> </mml:mi>
							<mml:msqrt>
								<mml:mrow>
									<mml:munderover>
										<mml:mo stretchy="false">∑</mml:mo>
										<mml:mrow>
											<mml:mi>i</mml:mi>
											<mml:mo>=</mml:mo>
											<mml:mn>1</mml:mn>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>n</mml:mi>
										</mml:mrow>
									</mml:munderover>
									<mml:mrow>
										<mml:msup>
											<mml:mrow>
												<mml:mo>(</mml:mo>
												<mml:msub>
													<mml:mrow>
														<mml:mi>x</mml:mi>
													</mml:mrow>
													<mml:mrow>
														<mml:mi>i</mml:mi>
													</mml:mrow>
												</mml:msub>
												<mml:mo>-</mml:mo>
												<mml:msub>
													<mml:mrow>
														<mml:mi>y</mml:mi>
													</mml:mrow>
													<mml:mrow>
														<mml:mi>i</mml:mi>
													</mml:mrow>
												</mml:msub>
												<mml:mo>)</mml:mo>
											</mml:mrow>
											<mml:mrow>
												<mml:mn>2</mml:mn>
											</mml:mrow>
										</mml:msup>
									</mml:mrow>
								</mml:mrow>
							</mml:msqrt>
						</mml:math>
						<label>(6)</label>
					</disp-formula>
				</p>
				<p>Where:</p>
				<p>𝑥<sub>𝑖</sub> and 𝑦<sub>𝑖</sub> are the 𝑖-th dimensions of points 𝑥 and 𝑦 respectively.</p>
				<p>𝑛 is the number of dimensions (or features) in the dataset.</p>
				<p>By calculating the Euclidean distance between the new instance and each point in the training dataset, the KNN algorithm identifies the 𝑘-nearest neighbors, which are then used for classification.</p>
			</sec>
			<sec>
				<title>Naive Bayes</title>
				<p>Bayesian classification, a supervised learning and statistical method, operates on an underlying probabilistic model, allowing for the quantification of uncertainty in outcomes. It effectively addresses predictive problems by determining probabilities associated with various outcomes <xref ref-type="bibr" rid="B31"><sup>31</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B32"><sup>32</sup></xref>. In Naive Bayes classification, we aim to predict the class label 𝑦 based on the features 𝑥<sub>1</sub>, 𝑥<sub>2</sub>, ⋯, 𝑥<sub>𝑝</sub>. We assume that the features are conditionally independent given the class label <italic>y</italic>, which means that:</p>
				<p>
					<disp-formula id="e7">
						<mml:math>
							<mml:mi>P</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>1</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:mi> </mml:mi>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>2</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:mo>⋯</mml:mo>
									<mml:mo>,</mml:mo>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>p</mml:mi>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mi>P</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>1</mml:mn>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mi> </mml:mi>
							<mml:mo>×</mml:mo>
							<mml:mi>P</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>2</mml:mn>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>×</mml:mo>
							<mml:mo>⋯</mml:mo>
							<mml:mo>×</mml:mo>
							<mml:mi> </mml:mi>
							<mml:mi>P</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>p</mml:mi>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
								</mml:mrow>
							</mml:mfenced>
						</mml:math>
					</disp-formula>
				</p>
				<p>We can then apply Bayes’ theorem to calculate the probability of each class given the observed features:</p>
				<p>
					<disp-formula id="e8">
						<mml:math>
							<mml:mi>P</mml:mi>
							<mml:mfenced separators="|">
								<mml:mrow>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>1</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:mi> </mml:mi>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>2</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:mo>⋯</mml:mo>
									<mml:mo>,</mml:mo>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>p</mml:mi>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
							</mml:mfenced>
							<mml:mo>=</mml:mo>
							<mml:mfrac>
								<mml:mrow>
									<mml:mi>P</mml:mi>
									<mml:mfenced separators="|">
										<mml:mrow>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>1</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mi> </mml:mi>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>2</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mo>⋯</mml:mo>
											<mml:mo>,</mml:mo>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mi>p</mml:mi>
												</mml:mrow>
											</mml:msub>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>y</mml:mi>
											<mml:mo>=</mml:mo>
											<mml:mi>c</mml:mi>
										</mml:mrow>
									</mml:mfenced>
									<mml:mo>×</mml:mo>
									<mml:mi>P</mml:mi>
									<mml:mo>(</mml:mo>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
									<mml:mo>)</mml:mo>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>P</mml:mi>
									<mml:mo>(</mml:mo>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>1</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mn>2</mml:mn>
										</mml:mrow>
									</mml:msub>
									<mml:mo>,</mml:mo>
									<mml:mo>⋯</mml:mo>
									<mml:msub>
										<mml:mrow>
											<mml:mi>x</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>p</mml:mi>
										</mml:mrow>
									</mml:msub>
									<mml:mo>)</mml:mo>
								</mml:mrow>
							</mml:mfrac>
						</mml:math>
					</disp-formula>
				</p>
				<p>To classify a new instance with features 𝑥<sub>1</sub>, 𝑥<sub>2</sub>, ⋯, 𝑥<sub>𝑝</sub>, we select the class label that maximizes the posterior probability:</p>
				<p>
					<disp-formula id="e9">
						<mml:math>
							<mml:mover accent="true">
								<mml:mrow>
									<mml:mi>y</mml:mi>
								</mml:mrow>
								<mml:mo>^</mml:mo>
							</mml:mover>
							<mml:mo>=</mml:mo>
							<mml:mmultiscripts>
								<mml:mrow>
									<mml:mi> </mml:mi>
									<mml:mi>P</mml:mi>
									<mml:mfenced separators="|">
										<mml:mrow>
											<mml:mi>y</mml:mi>
											<mml:mo>=</mml:mo>
											<mml:mi>c</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>1</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mi> </mml:mi>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>2</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mo>⋯</mml:mo>
											<mml:mo>,</mml:mo>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mi>p</mml:mi>
												</mml:mrow>
											</mml:msub>
										</mml:mrow>
									</mml:mfenced>
									<mml:mo>=</mml:mo>
								</mml:mrow>
								<mml:mprescripts/>
								<mml:mrow>
									<mml:mi>c</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mrow>
										<mml:mrow>
											<mml:mi mathvariant="normal">arg</mml:mi>
										</mml:mrow>
										<mml:mo>⁡</mml:mo>
										<mml:mrow>
											<mml:mi>m</mml:mi>
											<mml:mi>a</mml:mi>
											<mml:mi>x</mml:mi>
										</mml:mrow>
									</mml:mrow>
								</mml:mrow>
							</mml:mmultiscripts>
							<mml:mmultiscripts>
								<mml:mrow>
									<mml:mi> </mml:mi>
									<mml:mi>P</mml:mi>
									<mml:mfenced separators="|">
										<mml:mrow>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>1</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mi> </mml:mi>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mn>2</mml:mn>
												</mml:mrow>
											</mml:msub>
											<mml:mo>,</mml:mo>
											<mml:mo>⋯</mml:mo>
											<mml:mo>,</mml:mo>
											<mml:msub>
												<mml:mrow>
													<mml:mi>x</mml:mi>
												</mml:mrow>
												<mml:mrow>
													<mml:mi>p</mml:mi>
												</mml:mrow>
											</mml:msub>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>y</mml:mi>
											<mml:mo>=</mml:mo>
											<mml:mi>c</mml:mi>
										</mml:mrow>
									</mml:mfenced>
									<mml:mo>×</mml:mo>
									<mml:mi>P</mml:mi>
									<mml:mo>(</mml:mo>
									<mml:mi>y</mml:mi>
									<mml:mo>=</mml:mo>
									<mml:mi>c</mml:mi>
									<mml:mo>)</mml:mo>
								</mml:mrow>
								<mml:mprescripts/>
								<mml:mrow>
									<mml:mi>c</mml:mi>
								</mml:mrow>
								<mml:mrow>
									<mml:mrow>
										<mml:mrow>
											<mml:mi mathvariant="normal">arg</mml:mi>
										</mml:mrow>
										<mml:mo>⁡</mml:mo>
										<mml:mrow>
											<mml:mi>m</mml:mi>
											<mml:mi>a</mml:mi>
											<mml:mi>x</mml:mi>
										</mml:mrow>
									</mml:mrow>
								</mml:mrow>
							</mml:mmultiscripts>
						</mml:math>
					</disp-formula>
				</p>
				<p>The classical procedure for Naive Bayes is as follows:</p>
				<p>
					<list list-type="bullet">
						<list-item>
							<p>Calculate Class Priors: Calculate the prior probabilities 𝑃 (𝑦 = 𝑐) for each class <italic>c</italic> based on the frequency of class labels in the training dataset.</p>
						</list-item>
						<list-item>
							<p>Calculate Class-Conditional Probabilities: For each feature 𝑥<sub>𝑖</sub>, calculate the class-conditional probability 𝑃(𝑥<sub>𝑖</sub>| 𝑦 = 𝑐) for each class 𝑐. This can be done using different probability distributions depending on the type of feature (e.g., Gaussian distribution for continuous features, multinomial distribution for discrete features).</p>
						</list-item>
						<list-item>
							<p>Calculate Posterior Probabilities: Using the conditional independence assumption, calculate the joint probability 𝑃(𝑥<sub>1</sub>, 𝑥<sub>2</sub>, ⋯, 𝑥<sub>𝑝</sub>|𝑦 = 𝑐) for each class <italic>c</italic> as the product of the individual class-conditional probabilities.</p>
						</list-item>
						<list-item>
							<p>Normalize Probabilities: Normalize the posterior probabilities 𝑃(𝑦 = 𝑐|𝑥<sub>1</sub>, 𝑥<sub>2</sub>, ⋯, 𝑥<sub>𝑝</sub>) by dividing each by the sum of all posterior probabilities.</p>
						</list-item>
						<list-item>
							<p>Classify New Instances: For each new instance with features 𝑥<sub>1</sub>, 𝑥<sub>2</sub>,⋯,𝑥<sub>𝑝</sub>, calculate the posterior probability for each class using Bayes’ theorem and select the class with the highest posterior probability as the predicted class label.</p>
						</list-item>
					</list>
				</p>
			</sec>
			<sec>
				<title>Deep learning</title>
				<p>Deep learning for classification is a sophisticated approach that leverages neural networks with multiple layers to categorize input data into distinct classes (<xref ref-type="fig" rid="f3">Figure 3</xref>). The process begins with data preprocessing, where input features are normalized or standardized to ensure uniformity in scale and the dataset is divided into training, validation, and testing sets for evaluation. Subsequently, the architecture of the neural network is defined, specifying the number of layers, neurons per layer, and activation functions. In classification tasks, the output layer typically comprises neurons equivalent to the number of classes, with a sigmoid or softmax activation function employed to generate class probabilities <xref ref-type="bibr" rid="B33"><sup>33</sup></xref>.</p>
				<p>
					<fig id="f3">
						<label>Figure 3</label>
						<caption>
							<title>A layout of Deep Learning Neural Network</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf3.png"/>
					</fig>
				</p>
				<p>
					<xref ref-type="fig" rid="f3">Figure 3</xref> show an illustration of deep learning neural network. During the forward propagation phase, input data traverse through the neural network, undergoing computations layer by layer to produce the network’s output. Each layer computes its output through a linear transformation followed by an activation function. To quantify the disparity between predicted output and true labels, an appropriate loss function is selected. For classification tasks, common choices include categorical cross-entropy for multi-class classification and binary cross-entropy for binary classification. The backpropagation process calculates gradients of the loss function with respect to model parameters and updates these parameters using an optimization algorithm, like stochastic gradient descent (SGD), Adam, or RMSprop <xref ref-type="bibr" rid="B34"><sup>34</sup></xref>.</p>
				<p>Training ensues by iteratively feeding batches of data through the network, computing loss, and adjusting parameters through backpropagation. Regular monitoring of the model’s performance on the validation set helps prevent overfitting and fine-tune hyperparameters. Finally, the model is evaluated on the test set to gauge its performance on unseen data. Metrics such as accuracy, precision, recall, and F1-score are computed to assess its efficacy. Activation functions, including sigmoid and ReLU, play crucial roles in determining the output of neurons within the network, contributing to its overall classification capability. <xref ref-type="fig" rid="f4">Figure 4</xref> shows the activation functions used in deep neural networks.</p>
				<p>
					<fig id="f4">
						<label>Figure 4</label>
						<caption>
							<title>Commonly used Activation function in Deep Learning</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf4.png"/>
					</fig>
				</p>
			</sec>
			<sec>
				<title>Model training and validation</title>
				<p>Due to the small size of the data, we utilized cross-validation to avoid overfitting. Cross-validation is a technique used to assess the performance of machine learning models by dividing the dataset into multiple subsets, or folds. Each fold is used as a validation set, while the rest of the data is used for training. This process is repeated multiple times; in our case, we use 5 folds, with each fold serving as the validation set exactly once. The main advantage of cross-validation is that it provides a more robust estimate of the model’s performance compared to a single train-test split. In this scenario, train-test split was not utilized due to the relatively small size of the dataset, which could lead to high variability in model performance estimates. By using cross-validation, we ensure that each data point is used for both training and validation, leading to a more reliable assessment of the model’s generalization ability <xref ref-type="bibr" rid="B35"><sup>35</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B36"><sup>36</sup></xref><sup>,</sup><xref ref-type="bibr" rid="B37"><sup>37</sup></xref>. The formula for calculating the average metric across all folds in cross-validation can be expressed as:</p>
				<p>
					<disp-formula id="e10">
						<mml:math>
							<mml:mi>A</mml:mi>
							<mml:mi>v</mml:mi>
							<mml:mi>e</mml:mi>
							<mml:mi>r</mml:mi>
							<mml:mi>a</mml:mi>
							<mml:mi>g</mml:mi>
							<mml:mi>e</mml:mi>
							<mml:mi> </mml:mi>
							<mml:mi>M</mml:mi>
							<mml:mi>e</mml:mi>
							<mml:mi>t</mml:mi>
							<mml:mi>r</mml:mi>
							<mml:mi>i</mml:mi>
							<mml:mi>c</mml:mi>
							<mml:mo>=</mml:mo>
							<mml:mi> </mml:mi>
							<mml:mfrac>
								<mml:mrow>
									<mml:mn>1</mml:mn>
								</mml:mrow>
								<mml:mrow>
									<mml:mi>k</mml:mi>
								</mml:mrow>
							</mml:mfrac>
							<mml:mrow>
								<mml:munderover>
									<mml:mo stretchy="false">∑</mml:mo>
									<mml:mrow>
										<mml:mi>i</mml:mi>
										<mml:mo>=</mml:mo>
										<mml:mn>1</mml:mn>
									</mml:mrow>
									<mml:mrow>
										<mml:mi>k</mml:mi>
									</mml:mrow>
								</mml:munderover>
								<mml:mrow>
									<mml:msub>
										<mml:mrow>
											<mml:mi>M</mml:mi>
											<mml:mi>e</mml:mi>
											<mml:mi>t</mml:mi>
											<mml:mi>r</mml:mi>
											<mml:mi>i</mml:mi>
											<mml:mi>c</mml:mi>
										</mml:mrow>
										<mml:mrow>
											<mml:mi>i</mml:mi>
										</mml:mrow>
									</mml:msub>
								</mml:mrow>
							</mml:mrow>
						</mml:math>
					</disp-formula>
				</p>
				<p>Where:</p>
				<p>
					<list list-type="bullet">
						<list-item>
							<p>𝑘 is the number of folds in cross-validation.</p>
						</list-item>
						<list-item>
							<p>Metric<sub>𝑖</sub> represents the metric value (such as accuracy, precision, recall, F1-score, or balanced accuracy) calculated for each fold 𝑖.</p>
						</list-item>
					</list>
				</p>
				<p><xref ref-type="fig" rid="f5">Figure 5</xref> below illustrates how cross-validation works in the training and validation of our classification models.</p>
				<p>
					<fig id="f5">
						<label>Figure 5</label>
						<caption>
							<title>Cross validation scheme for model training and validation</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf5.png"/>
					</fig>
				</p>
			</sec>
			<sec>
				<title>Model Evaluation Metrics</title>
				<p>In this section, we define and discuss key evaluation metrics used for assessing the performance of classification models.</p>
				<sec>
					<title>Accuracy</title>
					<p>Accuracy is a measure of the overall correctness of the model and is defined as the ratio of correctly predicted instances to the total instances.</p>
					<p>
						<disp-formula id="e11">
							<mml:math>
								<mml:mi mathvariant="normal">A</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">u</mml:mi>
								<mml:mi mathvariant="normal">r</mml:mi>
								<mml:mi mathvariant="normal">a</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">y</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:mi mathvariant="normal">N</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">m</mml:mi>
										<mml:mi mathvariant="normal">b</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">f</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">C</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">d</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">n</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">N</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">m</mml:mi>
										<mml:mi mathvariant="normal">b</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">f</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">d</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">n</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
				<sec>
					<title>Balanced Accuracy</title>
					<p>Balanced Accuracy accounts for class imbalance by taking the average of sensitivity (true positive rate) across the different classes. It is particularly useful when classes are unevenly distributed.</p>
					<p>
						<disp-formula id="e12">
							<mml:math>
								<mml:mi mathvariant="normal">B</mml:mi>
								<mml:mi mathvariant="normal">a</mml:mi>
								<mml:mi mathvariant="normal">l</mml:mi>
								<mml:mi mathvariant="normal">a</mml:mi>
								<mml:mi mathvariant="normal">n</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">e</mml:mi>
								<mml:mi mathvariant="normal">d</mml:mi>
								<mml:mi mathvariant="normal"> </mml:mi>
								<mml:mi mathvariant="normal">A</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">u</mml:mi>
								<mml:mi mathvariant="normal">r</mml:mi>
								<mml:mi mathvariant="normal">a</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">y</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:msub>
											<mml:mrow>
												<mml:mi>S</mml:mi>
												<mml:mi>e</mml:mi>
												<mml:mi>n</mml:mi>
												<mml:mi>s</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>v</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>y</mml:mi>
											</mml:mrow>
											<mml:mrow>
												<mml:mn>1</mml:mn>
											</mml:mrow>
										</mml:msub>
										<mml:mo>+</mml:mo>
										<mml:msub>
											<mml:mrow>
												<mml:mi>S</mml:mi>
												<mml:mi>e</mml:mi>
												<mml:mi>n</mml:mi>
												<mml:mi>s</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>v</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>y</mml:mi>
											</mml:mrow>
											<mml:mrow>
												<mml:mn>2</mml:mn>
											</mml:mrow>
										</mml:msub>
										<mml:mo>+</mml:mo>
										<mml:mo>⋯</mml:mo>
										<mml:mo>+</mml:mo>
										<mml:msub>
											<mml:mrow>
												<mml:mi>S</mml:mi>
												<mml:mi>e</mml:mi>
												<mml:mi>n</mml:mi>
												<mml:mi>s</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>v</mml:mi>
												<mml:mi>i</mml:mi>
												<mml:mi>t</mml:mi>
												<mml:mi>y</mml:mi>
											</mml:mrow>
											<mml:mrow>
												<mml:mi>k</mml:mi>
											</mml:mrow>
										</mml:msub>
									</mml:mrow>
									<mml:mrow>
										<mml:mi>k</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
				<sec>
					<title>Sensitivity (Recall)</title>
					<p>Sensitivity, also known as Recall or True Positive Rate, measures the ability of a model to correctly identify positive instances.</p>
					<p>
						<disp-formula id="e13">
							<mml:math>
								<mml:mi mathvariant="normal">S</mml:mi>
								<mml:mi mathvariant="normal">e</mml:mi>
								<mml:mi mathvariant="normal">n</mml:mi>
								<mml:mi mathvariant="normal">s</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">t</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">v</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">t</mml:mi>
								<mml:mi mathvariant="normal">y</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mo>+</mml:mo>
										<mml:mi mathvariant="normal">F</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">N</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">g</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
				<sec>
					<title>Specificity</title>
					<p>Specificity measures the ability of a model to correctly identify negative instances.</p>
					<p>
						<disp-formula id="e14">
							<mml:math>
								<mml:mi mathvariant="normal">S</mml:mi>
								<mml:mi mathvariant="normal">p</mml:mi>
								<mml:mi mathvariant="normal">e</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">f</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">t</mml:mi>
								<mml:mi mathvariant="normal">y</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">N</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">g</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">N</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">g</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mo>+</mml:mo>
										<mml:mi mathvariant="normal">F</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
				<sec>
					<title>Precision</title>
					<p>Precision measures the accuracy of positive predictions and is defined as the ratio of true positives to the sum of true positives and false positives.</p>
					<p>
						<disp-formula id="e15">
							<mml:math>
								<mml:mi mathvariant="normal">P</mml:mi>
								<mml:mi mathvariant="normal">r</mml:mi>
								<mml:mi mathvariant="normal">e</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">s</mml:mi>
								<mml:mi mathvariant="normal">i</mml:mi>
								<mml:mi mathvariant="normal">o</mml:mi>
								<mml:mi mathvariant="normal">n</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
									<mml:mrow>
										<mml:mi mathvariant="normal">T</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">u</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mo>+</mml:mo>
										<mml:mi mathvariant="normal">F</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">t</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">v</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
				<sec>
					<title>F1-Score</title>
					<p>F1-Score is the harmonic mean of precision and recall, providing a balanced measure of a model’s performance.</p>
					<p>
						<disp-formula id="e16">
							<mml:math>
								<mml:mi mathvariant="normal">F</mml:mi>
								<mml:mn>1</mml:mn>
								<mml:mo>-</mml:mo>
								<mml:mi mathvariant="normal">S</mml:mi>
								<mml:mi mathvariant="normal">c</mml:mi>
								<mml:mi mathvariant="normal">o</mml:mi>
								<mml:mi mathvariant="normal">r</mml:mi>
								<mml:mi mathvariant="normal">e</mml:mi>
								<mml:mo>=</mml:mo>
								<mml:mi> </mml:mi>
								<mml:mfrac>
									<mml:mrow>
										<mml:mn>2</mml:mn>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">x</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">n</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">x</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">R</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
									</mml:mrow>
									<mml:mrow>
										<mml:mi mathvariant="normal">P</mml:mi>
										<mml:mi mathvariant="normal">r</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">s</mml:mi>
										<mml:mi mathvariant="normal">i</mml:mi>
										<mml:mi mathvariant="normal">o</mml:mi>
										<mml:mi mathvariant="normal">n</mml:mi>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mo>+</mml:mo>
										<mml:mi mathvariant="normal"> </mml:mi>
										<mml:mi mathvariant="normal">R</mml:mi>
										<mml:mi mathvariant="normal">e</mml:mi>
										<mml:mi mathvariant="normal">c</mml:mi>
										<mml:mi mathvariant="normal">a</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
										<mml:mi mathvariant="normal">l</mml:mi>
									</mml:mrow>
								</mml:mfrac>
							</mml:math>
						</disp-formula>
					</p>
				</sec>
			</sec>
		</sec>
		<sec sec-type="results">
			<title>Results</title>
			<sec>
				<title>Target variable Distribution</title>
				<p>The distribution of the target variable, ’obese,’ reveals important insights into the composition of the dataset. In our analysis, the target variable represents whether an individual is classified as obese or not. The distribution of this variable is crucial for understanding the prevalence of obesity within the dataset. Upon examining the distribution, we observe that the dataset exhibits an imbalanced distribution in terms of the ’obese’ classes. There are two possible classes: ’Not Obese’ and ’Obese.’ The imbalanced distribution implies that one class significantly outnumbers the other. <xref ref-type="fig" rid="f6">Figure 6</xref> shows the plot of the target variable distribution before SMOTE.</p>
				<p>
					<fig id="f6">
						<label>Figure 6</label>
						<caption>
							<title>Class distribution of the response variable before SMOTE</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf6.png"/>
					</fig>
				</p>
				<p>To address the class imbalance and enhance the model’s ability to accurately predict both classes, oversampling techniques such as Synthetic Minority Over-sampling Technique (SMOTE) were employed. SMOTE generates synthetic instances of the minority class by interpolating between existing instances. This augmentation helps balance the class distribution and improves the model’s generalization to the minority class, ultimately contributing to more robust predictions. <xref ref-type="fig" rid="f7">Figure 7</xref> shows the class distribution of the target after SMOTE.</p>
				<p>
					<fig id="f7">
						<label>Figure 7</label>
						<caption>
							<title>Class distribution of the response variable after SMOTE</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf7.png"/>
					</fig>
				</p>
				<p>In the subsequent sections, we will assess the impact of oversampling on model performance by comparing various metrics and visualizations before and after oversampling.</p>
				<p>
					<xref ref-type="table" rid="t3">Table 2</xref> shows the performance of the classification models on various metrics before SMOTE as percentages. <xref ref-type="table" rid="t4">Table 3</xref> shows the performance of the classification models after SMOTE. <xref ref-type="table" rid="t5">Table 4</xref> shows the metric improvement of the classification models as percentages. <xref ref-type="fig" rid="f8">Figure 8</xref> shows a visualization of the metrics deviation after SMOTE and <xref ref-type="fig" rid="f9">Figure 9</xref> shows a visualization of the metrics before and after SMOTE.</p>
				<p>
					<table-wrap id="t3">
						<label>Table 2</label>
						<caption>
							<title>Performance Metrics Before SMOTE (as percentages)</title>
						</caption>
						<table>
							<colgroup>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
							</colgroup>
							<thead>
								<tr>
									<th align="left">Model</th>
									<th align="right">Accuracy</th>
									<th align="right">Balanced Accuracy</th>
									<th align="right">Sensitivity</th>
									<th align="right">Specificity</th>
									<th align="right">Precision</th>
									<th align="right">F1-Score</th>
								</tr>
							</thead>
							<tbody>
								<tr>
									<td align="left">Logistic Regression</td>
									<td align="right">80.8</td>
									<td align="right">55.8</td>
									<td align="right">18.7</td>
									<td align="right">18.7</td>
									<td align="right">28.3</td>
									<td align="right">22.4</td>
								</tr>
								<tr>
									<td align="left">Naive Bayes</td>
									<td align="right">76.7</td>
									<td align="right">62.2</td>
									<td align="right">39.3</td>
									<td align="right">39.3</td>
									<td align="right">35.0</td>
									<td align="right">34.4</td>
								</tr>
								<tr>
									<td align="left">KNN (k=5)</td>
									<td align="right">80.0</td>
									<td align="right">48.0</td>
									<td align="right">0.0</td>
									<td align="right">0.0</td>
									<td align="right">0.0</td>
									<td align="right">0.0</td>
								</tr>
								<tr>
									<td align="left">Deep Learning</td>
									<td align="right">76.0</td>
									<td align="right">53.4</td>
									<td align="right">19.3</td>
									<td align="right">87.4</td>
									<td align="right">32.4</td>
									<td align="right">19.7</td>
								</tr>
							</tbody>
						</table>
					</table-wrap>
				</p>
				<p>
					<table-wrap id="t4">
						<label>Table 3</label>
						<caption>
							<title>Performance Metrics After SMOTE (as percentages)</title>
						</caption>
						<table>
							<colgroup>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
							</colgroup>
							<thead>
								<tr>
									<th align="left">Model</th>
									<th align="right">Accuracy</th>
									<th align="right">Balanced Accuracy</th>
									<th align="right">Sensitivity</th>
									<th align="right">Specificity</th>
									<th align="right">Precision</th>
									<th align="right">F1-Score</th>
								</tr>
							</thead>
							<tbody>
								<tr>
									<td align="left">Logistic Regression</td>
									<td align="right">72.0</td>
									<td align="right">72.4</td>
									<td align="right">75.4</td>
									<td align="right">75.4</td>
									<td align="right">71.7</td>
									<td align="right">72.2</td>
								</tr>
								<tr>
									<td align="left">Naive Bayes</td>
									<td align="right">79.0</td>
									<td align="right">78.9</td>
									<td align="right">87.2</td>
									<td align="right">87.2</td>
									<td align="right">75.0</td>
									<td align="right">80.5</td>
								</tr>
								<tr>
									<td align="left">KNN (k=5)</td>
									<td align="right">75.5</td>
									<td align="right">76.3</td>
									<td align="right">96.0</td>
									<td align="right">96.0</td>
									<td align="right">68.9</td>
									<td align="right">79.6</td>
								</tr>
								<tr>
									<td align="left">Deep Learning</td>
									<td align="right">82.5</td>
									<td align="right">82.8</td>
									<td align="right">89.1</td>
									<td align="right">76.4</td>
									<td align="right">78.4</td>
									<td align="right">83.2</td>
								</tr>
							</tbody>
						</table>
					</table-wrap>
				</p>
				<p>
					<table-wrap id="t5">
						<label>Table 4</label>
						<caption>
							<title>Deviance in Metrics</title>
						</caption>
						<table>
							<colgroup>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
								<col/>
							</colgroup>
							<thead>
								<tr>
									<th align="left">Model</th>
									<th align="center">Accuracy</th>
									<th align="center">Balanced Accuracy</th>
									<th align="center">Sensitivity</th>
									<th align="center">Specificity</th>
									<th align="center">Precision</th>
									<th align="center">F1-Score</th>
								</tr>
							</thead>
							<tbody>
								<tr>
									<td align="left">Logistic Regression</td>
									<td align="center">-8.8</td>
									<td align="center">+16.6</td>
									<td align="center">+56.7</td>
									<td align="center">+56.7</td>
									<td align="center">+43.4</td>
									<td align="center">+47.8</td>
								</tr>
								<tr>
									<td align="left">Naive Bayes</td>
									<td align="center">+2.3</td>
									<td align="center">+16.7</td>
									<td align="center">+47.9</td>
									<td align="center">+47.9</td>
									<td align="center">+40.0</td>
									<td align="center">+46.1</td>
								</tr>
								<tr>
									<td align="left">KNN (k=5)</td>
									<td align="center">-4.5</td>
									<td align="center">+28.3</td>
									<td align="center">+96.0</td>
									<td align="center">+96.0</td>
									<td align="center">+68.9</td>
									<td align="center">+79.6</td>
								</tr>
								<tr>
									<td align="left">Deep Learning</td>
									<td align="center">+6.5</td>
									<td align="center">+29.4</td>
									<td align="center">+69.8</td>
									<td align="center">-10.9</td>
									<td align="center">+45.9</td>
									<td align="center">+66.7</td>
								</tr>
							</tbody>
						</table>
					</table-wrap>
				</p>
				<p>
					<fig id="f8">
						<label>Figure 8</label>
						<caption>
							<title>Visualization of metrics deviation after SMOTE</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf8.png"/>
					</fig>
				</p>
				<p>
					<fig id="f9">
						<label>Figure 9</label>
						<caption>
							<title>Visualization of metrics before and after SMOTE</title>
						</caption>
						<graphic xlink:href="2665-427X-ijeph-7-01-e-11532-gf9.png"/>
					</fig>
				</p>
			</sec>
		</sec>
		<sec sec-type="discussion">
			<title>Discussion</title>
			<p>The Logistic Regression model initially performed reasonably well before SMOTE, achieving an accuracy of 80.8%. However, after applying SMOTE, there was a notable decrease in accuracy by -8.8%. Despite this decrease, the model’s performance in terms of sensitivity and specificity showed significant improvements. Sensitivity, which measures the ability to correctly identify obese individuals, increased substantially by +56.7% after SMOTE, indicating that the model became more effective at capturing true positives within the obese class. Similarly, specificity, representing the ability to correctly identify non-obese individuals, also saw a considerable improvement of +56.7% after SMOTE, suggesting a reduction in the false positive rate among non-obese individuals. These enhancements in sensitivity and specificity demonstrate the efficacy of SMOTE in addressing the class imbalance issue inherent in the dataset and improving the model’s ability to classify both classes accurately.</p>
			<p>Moreover, the balanced accuracy of the Logistic Regression model increased significantly by +16.6% after SMOTE, indicating a more balanced performance across both classes. The substantial improvement in precision by +43.4% and F1-score by +47.8% after SMOTE further underscores the effectiveness of SMOTE in enhancing the model’s predictive capability. Precision reflects the model’s ability to correctly classify positive predictions, while the F1-score balances both precision and recall, providing a robust measure of overall model performance. The notable improvements in these metrics suggest that SMOTE facilitated a more accurate and reliable classification of individuals into obese and non-obese categories, contributing to the overall enhancement in model performance.</p>
			<p>Overall, the results demonstrate that SMOTE had a significant positive impact on the performance of the Logistic Regression model across various evaluation metrics. While there was a slight decrease in overall accuracy, the model showed substantial improvements in sensitivity, specificity, balanced accuracy, precision, and F1-score after applying SMOTE. These improvements highlight the effectiveness of SMOTE in addressing the class imbalance issue inherent in the dataset, enabling the Logistic Regression model to achieve better classification results and make more accurate predictions regarding individuals’ obesity status.</p>
			<p>Before applying SMOTE, the Naive Bayes classifier exhibited a moderate level of accuracy, achieving 76.7%. However, after implementing SMOTE, there was a modest increase in accuracy by +2.3%. More notably, the model’s performance in terms of sensitivity and specificity experienced substantial improvements. Sensitivity, representing the ability to correctly identify obese individuals, increased significantly by +47.9% after SMOTE. This enhancement indicates that the model became more effective at capturing true positives within the obese class. Similarly, specificity, which measures the ability to correctly identify non-obese individuals, also saw a considerable improvement of +47.9% after SMOTE, suggesting a reduction in false positives among non-obese individuals. These enhancements underscore the effectiveness of SMOTE in addressing the class imbalance issue and improving the Naive Bayes model’s classification accuracy.</p>
			<p>Moreover, the balanced accuracy of the Naive Bayes model increased notably by +16.7% after SMOTE, indicating a more balanced performance across both classes. Additionally, there were significant improvements in precision by +40.0% and F1-score by +46.1% after SMOTE. Precision reflects the model’s ability to correctly classify positive predictions, while the F1-score provides a balanced measure of both precision and recall. The considerable improvements in these metrics suggest that SMOTE facilitated a more accurate and reliable classification of individuals into obese and non-obese categories, contributing to the overall enhancement in model performance.</p>
			<p>The application of SMOTE had a positive impact on the performance of the Naive Bayes classifier across various evaluation metrics. While there was a modest increase in accuracy, the model demonstrated substantial improvements in sensitivity, specificity, balanced accuracy, precision, and F1-score after applying SMOTE. These improvements highlight the efficacy of SMOTE in mitigating the effects of class imbalance and improving the Naive Bayes model’s ability to accurately classify individuals based on their obesity status.</p>
			<p>The KNN (<italic>k</italic>=5) classifier exhibited strong sensitivity at 96.0% but struggled with specificity, which was notably low at 0.0%. This indicates its difficulty in correctly identifying non-obese individuals.</p>
			<p>Remarkable improvements were observed across all metrics after implementing SMOTE. Both sensitivity and specificity increased significantly by +96.0%, indicating substantial enhancement in correctly identifying both obese and non-obese individuals. This suggests effective mitigation of the class imbalance issue by SMOTE, resulting in better classification accuracy.</p>
			<p>Post-SMOTE implementation, the KNN model showcased a transformative improvement in performance. Initially challenged by low specificity, SMOTE led to significant enhancements in sensitivity, specificity, balanced accuracy (+28.3%), precision (+68.9%), and F1-score (+79.6%). These enhancements underscore the effectiveness of SMOTE in addressing class imbalance and improving the KNN model’s overall classification performance.</p>
			<p>Before the application of SMOTE, the deep learning model exhibited promising performance, particularly in specificity, achieving a high value of 87.4%. However, the model’s sensitivity was relatively low, with a value of 19.3%, indicating its struggle to correctly identify obese individuals. After the implementation of SMOTE, there was a notable improvement in the model’s performance across most metrics. Sensitivity increased significantly by +69.8%, indicating a substantial enhancement in the model’s ability to correctly identify obese individuals. However, there was a slight decrease in specificity by -10.9%, suggesting a higher rate of false positives among non-obese individuals. Despite this slight trade-off, the overall performance of the deep learning model improved significantly after SMOTE.</p>
			<p>Moreover, the balanced accuracy of the deep learning model increased substantially by +29.4% after SMOTE, indicating a more balanced performance across both classes. Additionally, there were significant enhancements in precision by +45.9% and F1-score by +66.7% after SMOTE. These improvements reflect the model’s improved ability to accurately classify positive predictions and achieve a balance between precision and recall. Despite the slight decrease in specificity, the substantial improvements in sensitivity, balanced accuracy, precision, and F1-score highlight the effectiveness of SMOTE in improving the overall classification performance of the deep learning model.</p>
			<p>The application of SMOTE had a positive impact on the performance of the deep learning model. While there was a slight trade-off in specificity, the model exhibited significant improvements in sensitivity, balanced accuracy, precision, and F1-score after applying SMOTE. These improvements demonstrate the efficacy of SMOTE in addressing the class imbalance issue and enhancing the deep learning model’s ability to accurately classify individuals based on their obesity status.</p>
			<sec>
				<title>Comparison with other studies</title>
				<p>Tree-based machine learning approaches, including Logistic Regression, Random Forest (RF), and Extreme Gradient Boosting (XGBoost), were employed to classify obesity levels <xref ref-type="bibr" rid="B38"><sup>38</sup></xref>. The study found that LR performed best across most metrics after addressing class imbalance using SMOTE-NC and feature selection via Recursive Feature Elimination. Our findings similarly highlight the robustness of LR in the context of obesity prediction, with SMOTE significantly enhancing various performance metrics. A study <xref ref-type="bibr" rid="B9"><sup>9</sup></xref> focused on preprocessing an obesity dataset and used SVM, RF, and Decision Trees for classification. The RF model showed the highest prediction accuracy (96%). This aligns with our findings, which show that RF is effective in obesity prediction, though our use of SMOTE further improved the model’s performance metrics. Another study assessed ML methods, including Logistic Regression, Classification and Regression Trees, and Naive Bayes, to predict obesity <xref ref-type="bibr" rid="B39"><sup>39</sup></xref>. Logistic regression showed the highest performance, which is consistent with our study. The application of SMOTE in our research also addressed data imbalance effectively, leading to significant improvements in sensitivity, specificity, and balanced accuracy.</p>
			</sec>
		</sec>
		<sec sec-type="conclusions">
			<title>Conclusions</title>
			<p>The application of SMOTE has demonstrated significant improvements in the performance of various classification models for predicting obesity status. The results indicate that SMOTE effectively addressed the class imbalance issue present in the dataset, leading to enhanced model performance across multiple evaluation metrics. Specifically, SMOTE substantially improved sensitivity, specificity, balanced accuracy, precision, and F1-score for all models evaluated. This suggests that SMOTE successfully balanced the distribution of the minority class, allowing the models to better capture the underlying patterns and make more accurate predictions.</p>
			<p>Despite the overall improvements, there are some caveats to consider. While SMOTE improved the classification performance of most models, it also resulted in a decrease in specificity for the deep learning model. This suggests a higher rate of false positives among non-obese individuals, indicating potential areas for improvement. Additionally, the effectiveness of SMOTE may vary depending on the specific characteristics of the dataset and the chosen classification algorithm. Further experimentation and fine-tuning may be necessary to optimize the performance of the models and address any remaining challenges.</p>
			<p>Future research could explore alternative methods for addressing class imbalance issues beyond SMOTE, such as ensemble techniques or data augmentation strategies. Additionally, conducting a more comprehensive analysis of feature importance and model interpretability could provide valuable insights into the factors influencing obesity prediction and help identify areas for further refinement. Overall, while SMOTE has proven to be a valuable tool for improving classification performance in imbalanced datasets, continued research, and experimentation are essential to further advance the field of obesity prediction and enhance the effectiveness of predictive models in healthcare applications.</p>
			<def-list id="d1">
				<title>Abbreviations</title>
				<def-item>
					<term>ANN</term>
					<def>
						<p>Artificial Neural Network</p>
					</def>
				</def-item>
				<def-item>
					<term>BMI</term>
					<def>
						<p>Body Mass Index</p>
					</def>
				</def-item>
				<def-item>
					<term>KNN</term>
					<def>
						<p>K-Nearest Neighbors </p>
					</def>
				</def-item>
				<def-item>
					<term>SVM</term>
					<def>
						<p>Support Vector Machine</p>
					</def>
				</def-item>
				<def-item>
					<term>SMOTE</term>
					<def>
						<p>Synthetic Minority Over-sampling Technique</p>
					</def>
				</def-item>
				<def-item>
					<term>ML</term>
					<def>
						<p>Machine Learning</p>
					</def>
				</def-item>
			</def-list>
		</sec>
	</body>
	<back>
		<ref-list>
			<title>References</title>
			<ref id="B1">
				<label>1</label>
				<mixed-citation>1. Kopelman PG. Obesity as a medical problem. Nature. 2000; 404(6778): 635-643.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Kopelman</surname>
							<given-names>PG</given-names>
						</name>
					</person-group>
					<article-title>Obesity as a medical problem</article-title>
					<source>Nature</source>
					<year>2000</year>
					<volume>404</volume>
					<issue>6778</issue>
					<fpage>635</fpage>
					<lpage>643</lpage>
				</element-citation>
			</ref>
			<ref id="B2">
				<label>2</label>
				<mixed-citation>2. Ng M, Fleming T, Robinson M, Thomson B, Graetz N, Margono C, et al. Global, regional, and national prevalence of overweight and obesity in children and adults during 1980-2013: a systematic analysis for the global burden of disease study 2013. Lancet. 2014; 384(9945): 766-781.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Ng</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Fleming</surname>
							<given-names>T</given-names>
						</name>
						<name>
							<surname>Robinson</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Thomson</surname>
							<given-names>B</given-names>
						</name>
						<name>
							<surname>Graetz</surname>
							<given-names>N</given-names>
						</name>
						<name>
							<surname>Margono</surname>
							<given-names>C</given-names>
						</name>
						<etal/>
					</person-group>
					<article-title>Global, regional, and national prevalence of overweight and obesity in children and adults during 1980-2013: a systematic analysis for the global burden of disease study 2013</article-title>
					<source>Lancet</source>
					<year>2014</year>
					<volume>384</volume>
					<issue>9945</issue>
					<fpage>766</fpage>
					<lpage>781</lpage>
				</element-citation>
			</ref>
			<ref id="B3">
				<label>3</label>
				<mixed-citation>3. Omer T. The causes of obesity: an in-depth review. Adv Obes Weight Manag Control. 2020; 10(4): 90-94.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Omer</surname>
							<given-names>T</given-names>
						</name>
					</person-group>
					<article-title>The causes of obesity an in-depth review</article-title>
					<source>Adv Obes Weight Manag Control</source>
					<year>2020</year>
					<volume>10</volume>
					<issue>4</issue>
					<fpage>90</fpage>
					<lpage>94</lpage>
				</element-citation>
			</ref>
			<ref id="B4">
				<label>4</label>
				<mixed-citation>4. Aljanabi M, Qutqut MH, Hijjawi M. Machine learning classification techniques for heart disease prediction: a review. Internat J Engineer Technol. 2018; 7(4): 5373-5379.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Aljanabi</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Qutqut</surname>
							<given-names>MH</given-names>
						</name>
						<name>
							<surname>Hijjawi</surname>
							<given-names>M</given-names>
						</name>
					</person-group>
					<article-title>Machine learning classification techniques for heart disease prediction a review</article-title>
					<source>Internat J Engineer Technol</source>
					<year>2018</year>
					<volume>7</volume>
					<issue>4</issue>
					<fpage>5373</fpage>
					<lpage>5379</lpage>
				</element-citation>
			</ref>
			<ref id="B5">
				<label>5</label>
				<mixed-citation>5. Al-Hashem MA, Alqudah AM, Qananwah Q. Performance evaluation of different machine learning classification algorithms for disease diagnosis. Internat J E-Health Med Communicat. 2021; 12(6):1-28.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Al-Hashem</surname>
							<given-names>MA</given-names>
						</name>
						<name>
							<surname>Alqudah</surname>
							<given-names>AM</given-names>
						</name>
						<name>
							<surname>Qananwah</surname>
							<given-names>Q</given-names>
						</name>
					</person-group>
					<article-title>Performance evaluation of different machine learning classification algorithms for disease diagnosis</article-title>
					<source>Internat J E-Health Med Communicat</source>
					<year>2021</year>
					<volume>12</volume>
					<issue>6</issue>
					<fpage>1</fpage>
					<lpage>28</lpage>
				</element-citation>
			</ref>
			<ref id="B6">
				<label>6</label>
				<mixed-citation>6. An Q, Rahman S, Zhou J, Kang JJ. A comprehensive review on machine learning in healthcare industry: Classification, restrictions, opportunities and challenges. Sensors. 2023; 23(9): 4178.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>An</surname>
							<given-names>Q</given-names>
						</name>
						<name>
							<surname>Rahman</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Zhou</surname>
							<given-names>J</given-names>
						</name>
						<name>
							<surname>Kang</surname>
							<given-names>JJ</given-names>
						</name>
					</person-group>
					<article-title>A comprehensive review on machine learning in healthcare industry Classification, restrictions, opportunities and challenges</article-title>
					<source>Sensors</source>
					<year>2023</year>
					<volume>23</volume>
					<issue>9</issue>
					<fpage>4178</fpage>
					<lpage>4178</lpage>
				</element-citation>
			</ref>
			<ref id="B7">
				<label>7</label>
				<mixed-citation>7. Safaei M, Sundararajan EA, Driss M, Boulila W, Shapi'I A. A systematic literature review on obesity: Understanding the causes &amp; consequences of obesity and reviewing various machine learning approaches used to predict obesity. Computers Biol Med. 2021; 136: 104754.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Safaei</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Sundararajan</surname>
							<given-names>EA</given-names>
						</name>
						<name>
							<surname>Driss</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Boulila</surname>
							<given-names>W</given-names>
						</name>
						<name>
							<surname>Shapi'I</surname>
							<given-names>A</given-names>
						</name>
					</person-group>
					<article-title>A systematic literature review on obesity Understanding the causes &amp; consequences of obesity and reviewing various machine learning approaches used to predict obesity</article-title>
					<source>Computers Biol Med</source>
					<year>2021</year>
					<volume>136</volume>
					<fpage>104754</fpage>
					<lpage>104754</lpage>
				</element-citation>
			</ref>
			<ref id="B8">
				<label>8</label>
				<mixed-citation>8. Ferdowsy F, Alam RKS, Jabiullah I, Habib T. A machine learning approach for obesity risk prediction. Current Res Behavioral Sci. 2021; 2: 100053.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Ferdowsy</surname>
							<given-names>F</given-names>
						</name>
						<name>
							<surname>Alam</surname>
							<given-names>RKS</given-names>
						</name>
						<name>
							<surname>Jabiullah</surname>
							<given-names>I</given-names>
						</name>
						<name>
							<surname>Habib</surname>
							<given-names>T</given-names>
						</name>
					</person-group>
					<article-title>A machine learning approach for obesity risk prediction</article-title>
					<source>Current Res Behavioral Sci</source>
					<year>2021</year>
					<volume>2</volume>
					<fpage>100053</fpage>
					<lpage>100053</lpage>
				</element-citation>
			</ref>
			<ref id="B9">
				<label>9</label>
				<mixed-citation>9. Astuti TS, Sidik AD, Kuswanto H, Lawi A, Nasir S. Predicting obesity in adults using machine learning techniques: an analysis of Indonesian basic health research 2018. Frontiers Nutrition. 2021; 8: 669155.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Astuti</surname>
							<given-names>TS</given-names>
						</name>
						<name>
							<surname>Sidik</surname>
							<given-names>AD</given-names>
						</name>
						<name>
							<surname>Kuswanto</surname>
							<given-names>H</given-names>
						</name>
						<name>
							<surname>Lawi</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>Nasir</surname>
							<given-names>S</given-names>
						</name>
					</person-group>
					<article-title>Predicting obesity in adults using machine learning techniques an analysis of Indonesian basic health research 2018</article-title>
					<source>Frontiers Nutrition</source>
					<year>2021</year>
					<volume>8</volume>
					<fpage>669155</fpage>
					<lpage>669155</lpage>
				</element-citation>
			</ref>
			<ref id="B10">
				<label>10</label>
				<mixed-citation>10. Curbelo MCA, Fergus P, Hussain A, Al-Jumeily D, Abdulaimma B, Hind J, Radi N. Machine learning approaches for the prediction of obesity using publicly available genetic profiles. In 2017 International Joint Conference on Neural Networks (IJCNN), pages 2743-2750. IEEE, 2017.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Curbelo</surname>
							<given-names>MCA</given-names>
						</name>
						<name>
							<surname>Fergus</surname>
							<given-names>P</given-names>
						</name>
						<name>
							<surname>Hussain</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>Al-Jumeily</surname>
							<given-names>D</given-names>
						</name>
						<name>
							<surname>Abdulaimma</surname>
							<given-names>B</given-names>
						</name>
						<name>
							<surname>Hind</surname>
							<given-names>J</given-names>
						</name>
						<name>
							<surname>Radi</surname>
							<given-names>N</given-names>
						</name>
					</person-group>
					<source>Machine learning approaches for the prediction of obesity using publicly available genetic profiles</source>
					<conf-name>2017 International Joint Conference on Neural Networks (IJCNN)</conf-name>
					<publisher-name>IEEE</publisher-name>
					<year>2017</year>
				</element-citation>
			</ref>
			<ref id="B11">
				<label>11</label>
				<mixed-citation>11. Zheng Z, Ruggiero K. Using machine learning to predict obesity in high school students. In 2017 IEEE International Conference on Bioinformatics and Biomedicine (BIBM), pages 2132-2138. IEEE, 2017.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Zheng</surname>
							<given-names>Z</given-names>
						</name>
						<name>
							<surname>Ruggiero</surname>
							<given-names>K</given-names>
						</name>
					</person-group>
					<source>Using machine learning to predict obesity in high school students</source>
					<conf-name>2017 IEEE International Conference on Bioinformatics and Biomedicine (BIBM)</conf-name>
					<publisher-name>IEEE</publisher-name>
					<year>2017</year>
				</element-citation>
			</ref>
			<ref id="B12">
				<label>12</label>
				<mixed-citation>12. Cheng X, Lin S-Y, Liu J, Liu S, Zhang J, Nie P, et al. Does physical activity predict obesity-a machine learning and statistical method-based analysis. Internat J Environm Res Public Health. 2021; 18(8): 3966.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Cheng</surname>
							<given-names>X</given-names>
						</name>
						<name>
							<surname>Lin</surname>
							<given-names>S-Y</given-names>
						</name>
						<name>
							<surname>Liu</surname>
							<given-names>J</given-names>
						</name>
						<name>
							<surname>Liu</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Zhang</surname>
							<given-names>J</given-names>
						</name>
						<name>
							<surname>Nie</surname>
							<given-names>P</given-names>
						</name>
					</person-group>
					<article-title>Does physical activity predict obesity-a machine learning and statistical method-based analysis</article-title>
					<source>Internat J Environm Res Public Health</source>
					<year>2021</year>
					<volume>18</volume>
					<issue>8</issue>
					<fpage>3966</fpage>
					<lpage>3966</lpage>
				</element-citation>
			</ref>
			<ref id="B13">
				<label>13</label>
				<mixed-citation>13. Chawla NV, Bowyer KW, Hall LO, Kegelmeyer WP. Smote: synthetic minority over-sampling technique. J Artificial Intelligence Research. 2002; 16: 321-357.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Chawla</surname>
							<given-names>NV</given-names>
						</name>
						<name>
							<surname>Bowyer</surname>
							<given-names>KW</given-names>
						</name>
						<name>
							<surname>Hall</surname>
							<given-names>LO</given-names>
						</name>
						<name>
							<surname>Kegelmeyer</surname>
							<given-names>WP</given-names>
						</name>
					</person-group>
					<article-title>Smote synthetic minority over-sampling technique</article-title>
					<source>J Artificial Intelligence Research</source>
					<year>2002</year>
					<volume>16</volume>
					<fpage>321</fpage>
					<lpage>357</lpage>
				</element-citation>
			</ref>
			<ref id="B14">
				<label>14</label>
				<mixed-citation>14. Kosolwattana T, Liu C, Hu R, Han S, Chen H, Lin Y. A self-inspected adaptive smote algorithm (sasmote) for highly imbalanced data classification in healthcare. BioData Mining. 2023; 16(1): 15.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Kosolwattana</surname>
							<given-names>T</given-names>
						</name>
						<name>
							<surname>Liu</surname>
							<given-names>C</given-names>
						</name>
						<name>
							<surname>Hu</surname>
							<given-names>R</given-names>
						</name>
						<name>
							<surname>Han</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Chen</surname>
							<given-names>H</given-names>
						</name>
						<name>
							<surname>Lin</surname>
							<given-names>Y</given-names>
						</name>
					</person-group>
					<article-title>A self-inspected adaptive smote algorithm (sasmote) for highly imbalanced data classification in healthcare</article-title>
					<source>BioData Mining</source>
					<year>2023</year>
					<volume>16</volume>
					<issue>1</issue>
					<fpage>15</fpage>
					<lpage>15</lpage>
				</element-citation>
			</ref>
			<ref id="B15">
				<label>15</label>
				<mixed-citation>15. Fernández A, Garcia S, Herrera F, Chawla NV. Smote for learning from imbalanced data: progress and challenges, marking the 15-year anniversary. J Artificial Intelligence Res. 2018; 61: 863-905.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Fernández</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>Garcia</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Herrera</surname>
							<given-names>F</given-names>
						</name>
						<name>
							<surname>Chawla</surname>
							<given-names>NV</given-names>
						</name>
					</person-group>
					<article-title>Smote for learning from imbalanced data progress and challenges, marking the 15-year anniversary</article-title>
					<source>J Artificial Intelligence Res</source>
					<year>2018</year>
					<volume>61</volume>
					<fpage>863</fpage>
					<lpage>905</lpage>
				</element-citation>
			</ref>
			<ref id="B16">
				<label>16</label>
				<mixed-citation>16. Blagus R, Lara L. Smote for high-dimensional class-imbalanced data. BMC Bioinformatics. 2013; 14: 1-16.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Blagus</surname>
							<given-names>R</given-names>
						</name>
						<name>
							<surname>Lara</surname>
							<given-names>L</given-names>
						</name>
					</person-group>
					<article-title>Smote for high-dimensional class-imbalanced data</article-title>
					<source>BMC bioinformatics</source>
					<year>2013</year>
					<volume>14</volume>
					<fpage>1</fpage>
					<lpage>16</lpage>
				</element-citation>
			</ref>
			<ref id="B17">
				<label>17</label>
				<mixed-citation>17. Han H, Wang W-Y, Mao B-H. Borderline-smote: a new over-sampling method in imbalanced data sets learning. International conference on intelligent computing, pages 878-887. Springer; 2005.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Han</surname>
							<given-names>H</given-names>
						</name>
						<name>
							<surname>Wang</surname>
							<given-names>W-Y</given-names>
						</name>
						<name>
							<surname>Mao</surname>
							<given-names>B-H</given-names>
						</name>
					</person-group>
					<source>Borderline-smote: a new over-sampling method in imbalanced data sets learning</source>
					<conf-name>International conference on intelligent computing</conf-name>
					<publisher-name>Springer</publisher-name>
					<year>2005</year>
				</element-citation>
			</ref>
			<ref id="B18">
				<label>18</label>
				<mixed-citation>18. Sreejith S, Khanna NH, Kannan A. Clinical data classification using an enhanced smote and chaotic evolutionary feature selection. Computers Biology Medicine. 2020; 126: 103991.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Sreejith</surname>
							<given-names>S</given-names>
						</name>
						<name>
							<surname>Khanna</surname>
							<given-names>NH</given-names>
						</name>
						<name>
							<surname>Kannan</surname>
							<given-names>A</given-names>
						</name>
					</person-group>
					<article-title>Clinical data classification using an enhanced smote and chaotic evolutionary feature selection</article-title>
					<source>Computers Biology Medicine</source>
					<year>2020</year>
					<volume>126</volume>
					<fpage>103991</fpage>
					<lpage>103991</lpage>
				</element-citation>
			</ref>
			<ref id="B19">
				<label>19</label>
				<mixed-citation>19. Elreedy D, Atiya AF. A comprehensive analysis of synthetic minority oversampling technique (smote) for handling class imbalance. Information Sciences. 2019; 505: 32-64.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Elreedy</surname>
							<given-names>D</given-names>
						</name>
						<name>
							<surname>Atiya</surname>
							<given-names>AF</given-names>
						</name>
					</person-group>
					<article-title>A comprehensive analysis of synthetic minority oversampling technique (smote) for handling class imbalance</article-title>
					<source>Information Sciences</source>
					<year>2019</year>
					<volume>505</volume>
					<fpage>32</fpage>
					<lpage>64</lpage>
				</element-citation>
			</ref>
			<ref id="B20">
				<label>20</label>
				<mixed-citation>20. Ismail E, Gad W, Hashem M. A hybrid stacking-smote model for optimizing the prediction of autistic genes. BMC bioinformatics. 2023; 24(1): 379.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Ismail</surname>
							<given-names>E</given-names>
						</name>
						<name>
							<surname>Gad</surname>
							<given-names>W</given-names>
						</name>
						<name>
							<surname>Hashem</surname>
							<given-names>M</given-names>
						</name>
					</person-group>
					<article-title>A hybrid stacking-smote model for optimizing the prediction of autistic genes</article-title>
					<source>BMC bioinformatics</source>
					<year>2023</year>
					<volume>24</volume>
					<issue>1</issue>
					<fpage>379</fpage>
					<lpage>379</lpage>
				</element-citation>
			</ref>
			<ref id="B21">
				<label>21</label>
				<mixed-citation>21. Wang L. Imbalanced credit risk prediction based on smote and multi-kernel FCM improved by particle swarm optimization. Applied Soft Computing. 2022; 114: 108153.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Wang</surname>
							<given-names>L</given-names>
						</name>
					</person-group>
					<article-title>Imbalanced credit risk prediction based on smote and multi-kernel FCM improved by particle swarm optimization</article-title>
					<source>Applied Soft Computing</source>
					<year>2022</year>
					<volume>114</volume>
					<fpage>108153</fpage>
					<lpage>108153</lpage>
				</element-citation>
			</ref>
			<ref id="B22">
				<label>22</label>
				<mixed-citation>22. Yee CPC, Yang Y, Giin LB. Enhancing financial fraud detection through addressing class imbalance using hybrid smote-gan techniques. Internat J Financial Studies. 2023; 11(3): 110.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Yee</surname>
							<given-names>CPC</given-names>
						</name>
						<name>
							<surname>Yang</surname>
							<given-names>Y</given-names>
						</name>
						<name>
							<surname>Giin</surname>
							<given-names>LB</given-names>
						</name>
					</person-group>
					<article-title>Enhancing financial fraud detection through addressing class imbalance using hybrid smote-gan techniques</article-title>
					<source>Internat J Financial Studies</source>
					<year>2023</year>
					<volume>11</volume>
					<issue>3</issue>
					<fpage>110</fpage>
					<lpage>110</lpage>
				</element-citation>
			</ref>
			<ref id="B23">
				<label>23</label>
				<mixed-citation>23. Li H, Liu H, Hu Y. Prediction of unbalanced financial risk based on gra-topsis and smote-cnn. Scientific Programming. 2022; 2022(1): 8074516.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Li</surname>
							<given-names>H</given-names>
						</name>
						<name>
							<surname>Liu</surname>
							<given-names>H</given-names>
						</name>
						<name>
							<surname>Hu</surname>
							<given-names>Y</given-names>
						</name>
					</person-group>
					<article-title>Prediction of unbalanced financial risk based on gra-topsis and smote-cnn</article-title>
					<source>Scientific Programming</source>
					<year>2022</year>
					<volume>2022</volume>
					<issue>1</issue>
					<fpage>8074516</fpage>
					<lpage>8074516</lpage>
				</element-citation>
			</ref>
			<ref id="B24">
				<label>24</label>
				<mixed-citation>24. Özdemir A, Polat K, Alhudhaif A. Classification of imbalanced hyperspectral images using smote-based deep learning methods. Expert Systems Applications. 2021; 178: 114986.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Özdemir</surname>
							<given-names>A</given-names>
						</name>
						<name>
							<surname>Polat</surname>
							<given-names>K</given-names>
						</name>
						<name>
							<surname>Alhudhaif</surname>
							<given-names>A</given-names>
						</name>
					</person-group>
					<article-title>Classification of imbalanced hyperspectral images using smote-based deep learning methods</article-title>
					<source>Expert Systems Applications</source>
					<year>2021</year>
					<volume>178</volume>
					<fpage>114986</fpage>
					<lpage>114986</lpage>
				</element-citation>
			</ref>
			<ref id="B25">
				<label>25</label>
				<mixed-citation>25. Chamseddine E, Mansouri N, Soui M, Abed M. Handling class imbalance in covid-19 chest x-ray images classification: Using smote and weighted loss. Applied Soft Computing. 2022; 129: 109588.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Chamseddine</surname>
							<given-names>E</given-names>
						</name>
						<name>
							<surname>Mansouri</surname>
							<given-names>N</given-names>
						</name>
						<name>
							<surname>Soui</surname>
							<given-names>M</given-names>
						</name>
						<name>
							<surname>Abed</surname>
							<given-names>M</given-names>
						</name>
					</person-group>
					<article-title>Handling class imbalance in covid-19 chest x-ray images classification Using smote and weighted loss</article-title>
					<source>Applied Soft Computing</source>
					<year>2022</year>
					<volume>129</volume>
					<fpage>109588</fpage>
					<lpage>109588</lpage>
				</element-citation>
			</ref>
			<ref id="B26">
				<label>26</label>
				<mixed-citation>26. Sami JA. Heart disease prediction system using (smote technique) balanced dataset and decision tree classifier. AIP Conference Proceedings, volume 2834. AIP Publishing; 2023.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Sami</surname>
							<given-names>JA</given-names>
						</name>
					</person-group>
					<source>Heart disease prediction system using (smote technique) balanced dataset and decision tree classifier</source>
					<conf-name>AIP Conference Proceedings</conf-name>
					<publisher-name>AIP Publishing</publisher-name>
					<year>2023</year>
				</element-citation>
			</ref>
			<ref id="B27">
				<label>27</label>
				<mixed-citation>27. Prasad PS, Sreedevi M. An improved prediction of kidney disease using smote. Indian J Sci Technol. 2016; 9(31): 1-7.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Prasad</surname>
							<given-names>PS</given-names>
						</name>
						<name>
							<surname>Sreedevi</surname>
							<given-names>M</given-names>
						</name>
					</person-group>
					<article-title>An improved prediction of kidney disease using smote</article-title>
					<source>Indian J Sci Technol</source>
					<year>2016</year>
					<volume>9</volume>
					<issue>31</issue>
					<fpage>1</fpage>
					<lpage>7</lpage>
				</element-citation>
			</ref>
			<ref id="B28">
				<label>28</label>
				<mixed-citation>28. Sowjanya AM, Mrudula O. Effective treatment of imbalanced datasets in health care using modified smote coupled with stacked deep learning algorithms. Applied Nanoscience. 2023; 13(3): 1829-1840.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Sowjanya</surname>
							<given-names>AM</given-names>
						</name>
						<name>
							<surname>Mrudula</surname>
							<given-names>O</given-names>
						</name>
					</person-group>
					<article-title>Effective treatment of imbalanced datasets in health care using modified smote coupled with stacked deep learning algorithms</article-title>
					<source>Applied Nanoscience</source>
					<year>2023</year>
					<volume>13</volume>
					<issue>3</issue>
					<fpage>1829</fpage>
					<lpage>1840</lpage>
				</element-citation>
			</ref>
			<ref id="B29">
				<label>29</label>
				<mixed-citation>29. Nasteski V. An overview of the supervised machine learning methods. Horizons b. 2017; 4: 51-62.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Nasteski</surname>
							<given-names>V</given-names>
						</name>
					</person-group>
					<article-title>An overview of the supervised machine learning methods</article-title>
					<source>Horizons b</source>
					<year>2017</year>
					<volume>4</volume>
					<fpage>51</fpage>
					<lpage>62</lpage>
				</element-citation>
			</ref>
			<ref id="B30">
				<label>30</label>
				<mixed-citation>30. Peterson LE. K-nearest neighbor. Scholarpedia. 2009; 4(2): 1883.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Peterson</surname>
							<given-names>LE</given-names>
						</name>
					</person-group>
					<article-title>K-nearest neighbor</article-title>
					<source>Scholarpedia</source>
					<year>2009</year>
					<volume>4</volume>
					<issue>2</issue>
					<fpage>1883</fpage>
					<lpage>1883</lpage>
				</element-citation>
			</ref>
			<ref id="B31">
				<label>31</label>
				<mixed-citation>31. Cunningham P, Delany SJ. k-nearest neighbour classifiers-a tutorial. ACM computing surveys. 2021; 54(6): 1-25.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Cunningham</surname>
							<given-names>P</given-names>
						</name>
						<name>
							<surname>Delany</surname>
							<given-names>SJ</given-names>
						</name>
					</person-group>
					<article-title>k-nearest neighbour classifiers-a tutorial</article-title>
					<source>ACM computing surveys</source>
					<year>2021</year>
					<volume>54</volume>
					<issue>6</issue>
					<fpage>1</fpage>
					<lpage>25</lpage>
				</element-citation>
			</ref>
			<ref id="B32">
				<label>32</label>
				<mixed-citation>32. Rish I. An empirical study of the naive bayes classifier. IJCAI 2001 workshop on empirical methods in artificial intelligence; 2001.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Rish</surname>
							<given-names>I</given-names>
						</name>
					</person-group>
					<source>An empirical study of the naive bayes classifier</source>
					<conf-name>IJCAI 2001 workshop on empirical methods in artificial intelligence</conf-name>
					<year>2001</year>
				</element-citation>
			</ref>
			<ref id="B33">
				<label>33</label>
				<mixed-citation>33. Goodfellow I, Bengio Y, Courville A. Deep learning. MIT press; 2016.</mixed-citation>
				<element-citation publication-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Goodfellow</surname>
							<given-names>I</given-names>
						</name>
						<name>
							<surname>Bengio</surname>
							<given-names>Y</given-names>
						</name>
						<name>
							<surname>Courville</surname>
							<given-names>A</given-names>
						</name>
					</person-group>
					<source>Deep learning</source>
					<publisher-name>MIT press</publisher-name>
					<year>2016</year>
				</element-citation>
			</ref>
			<ref id="B34">
				<label>34</label>
				<mixed-citation>34. Stevens E, Antiga L, Viehmann T. Deep learning with PyTorch. Manning Publications; 2020.</mixed-citation>
				<element-citation publication-type="book">
					<person-group person-group-type="author">
						<name>
							<surname>Stevens</surname>
							<given-names>E</given-names>
						</name>
						<name>
							<surname>Antiga</surname>
							<given-names>L</given-names>
						</name>
						<name>
							<surname>Viehmann</surname>
							<given-names>T</given-names>
						</name>
					</person-group>
					<source>Deep learning with PyTorch</source>
					<publisher-name>Manning Publications</publisher-name>
					<year>2020</year>
				</element-citation>
			</ref>
			<ref id="B35">
				<label>35</label>
				<mixed-citation>35. Kofi NI, Nyarko-Boateng O, Aning J, et al. Performance of machine learning algorithms with different k values in k-fold crossvalidation. Internat J Information Technol Computer Sci. 2021; 13(6): 61-71.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Kofi</surname>
							<given-names>NI</given-names>
						</name>
						<name>
							<surname>Nyarko-Boateng</surname>
							<given-names>O</given-names>
						</name>
						<name>
							<surname>Aning</surname>
							<given-names>J</given-names>
						</name>
						<etal/>
					</person-group>
					<article-title>Performance of machine learning algorithms with different k values in k-fold crossvalidation</article-title>
					<source>Internat J Information Technol Computer Sci</source>
					<year>2021</year>
					<volume>13</volume>
					<issue>6</issue>
					<fpage>61</fpage>
					<lpage>71</lpage>
				</element-citation>
			</ref>
			<ref id="B36">
				<label>36</label>
				<mixed-citation>36. Tamilarasi P, Rani RU. Diagnosis of crime rate against women using k-fold cross validation through machine learning. 2020 fourth international conference on computing methodologies and communication (ICCMC). IEEE; 2020.</mixed-citation>
				<element-citation publication-type="confproc">
					<person-group person-group-type="author">
						<name>
							<surname>Tamilarasi</surname>
							<given-names>P</given-names>
						</name>
						<name>
							<surname>Rani</surname>
							<given-names>RU</given-names>
						</name>
					</person-group>
					<source>Diagnosis of crime rate against women using k-fold cross validation through machine learning</source>
					<conf-name>2020 fourth international conference on computing methodologies and communication (ICCMC)</conf-name>
					<publisher-name>IEEE</publisher-name>
					<year>2020</year>
				</element-citation>
			</ref>
			<ref id="B37">
				<label>37</label>
				<mixed-citation>37. Misra P, Singh YA. Improving the classification accuracy using recursive feature elimination with cross-validation. Int J Emerg Technol. 2020; 11(3): 659-665.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Misra</surname>
							<given-names>P</given-names>
						</name>
						<name>
							<surname>Singh</surname>
							<given-names>YA</given-names>
						</name>
					</person-group>
					<article-title>Improving the classification accuracy using recursive feature elimination with cross-validation</article-title>
					<source>Int J Emerg Technol</source>
					<year>2020</year>
					<volume>11</volume>
					<issue>3</issue>
					<fpage>659</fpage>
					<lpage>665</lpage>
				</element-citation>
			</ref>
			<ref id="B38">
				<label>38</label>
				<mixed-citation>38. Ram DR, Mukherjee I, Chakraborty C. Obesity disease risk prediction using machine learning. Internat J Data Sci Analytics. 2024. Doi: 10.1007/s41060-023-00491-9.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Ram</surname>
							<given-names>DR</given-names>
						</name>
						<name>
							<surname>Mukherjee</surname>
							<given-names>I</given-names>
						</name>
						<name>
							<surname>Chakraborty</surname>
							<given-names>C</given-names>
						</name>
					</person-group>
					<article-title>Obesity disease risk prediction using machine learning</article-title>
					<source>Internat J Data Sci Analytics</source>
					<year>2024</year>
					<pub-id pub-id-type="doi">10.1007/s41060-023-00491-9</pub-id>
				</element-citation>
			</ref>
			<ref id="B39">
				<label>39</label>
				<mixed-citation>39. Ab MNL, Anuar S. Machine learning modelling for imbalanced dataset: Case study of adolescent obesity in malaysia. J Adv Res Applied Sci Engineer Technol. 2023; 36(1): 189-202.</mixed-citation>
				<element-citation publication-type="journal">
					<person-group person-group-type="author">
						<name>
							<surname>Ab</surname>
							<given-names>MNL</given-names>
						</name>
						<name>
							<surname>Anuar</surname>
							<given-names>S</given-names>
						</name>
					</person-group>
					<article-title>Machine learning modelling for imbalanced dataset Case study of adolescent obesity in malaysia</article-title>
					<source>J Adv Res Applied Sci Engineer Technol</source>
					<year>2023</year>
					<volume>36</volume>
					<issue>1</issue>
					<fpage>189</fpage>
					<lpage>202</lpage>
				</element-citation>
			</ref>
		</ref-list>
		<fn-group>
			<title>Notes:</title>
			<fn fn-type="other" id="fn2">
				<label>Data availability:</label>
				<p> The data and Implementation materials are available upon request</p>
			</fn>
		</fn-group>
	</back>
</article>