datasaur/site/surveyapp/analysis/utils.py
2026-01-25 15:56:01 +00:00

231 lines
12 KiB
Python

import pandas as pd
from surveyapp import mongo
# from flask import Flask, current_app
from bson.objectid import ObjectId
# For carrying out statistical test
from scipy.stats import chi2_contingency, chisquare
from pingouin import kruskal, mwu
from surveyapp.surveys.utils import parse_data, read_file
import time
# This is a function that will automatically run when the user uploads a file. It will parse the data and
# run some statistical tests based on the type of data in each column. It will not run all tests (for example,
# non-parametric tests will only be run on definite categorical data - i.e. data that is string, object or
# boolean. Ordinal data with numeric values, such as likert scale, will not be tested as this data will be
# identified as numeric.) Furthermore, the results of tests will have to be checked by the user, to check the
# data passes the assumptions of the test.
# Likewise, Mann-Whitney U and Kruskal-Wallis are never both performed on the same pair of variables, even though
# it is in fact possible to perform Kruskal-Wallis on a variable with only 2 groups. This avoids conflicting results
# and the resulting increased risk of false positives.
def run_all_tests(survey_id, user_id, app):
    """Run the automatic statistical tests for a survey inside an app context.

    `run_tests` is executed while `app.app_context()` is active.

    Args:
        survey_id: Identifier of the survey, forwarded to `run_tests`.
        user_id: Identifier of the owning user, forwarded to `run_tests`.
        app: Object exposing `app_context()` (presumably the Flask
            application -- TODO confirm against the caller).
    """
    context = app.app_context()
    with context:
        run_tests(survey_id, user_id)
def run_tests(survey_id, user_id):
    """Run the automatic significance tests on a survey and store the hits.

    The survey document is looked up by `survey_id`, its file is loaded with
    `read_file` and described with `parse_data`. Every column whose
    "data_type" is "categorical" or "true/false" is then:

    * tested on its own with a chi-square goodness of fit, and
    * paired with every other column -- chi-square test of independence for
      another categorical column, Mann-Whitney U (2 groups) or Kruskal-Wallis
      (more than 2 groups) for a numerical column.

    Each result with a p-value below 0.05 is inserted into
    `mongo.db.temp_results` together with `user_id` and `survey_id`.

    Args:
        survey_id: Id of the survey document (converted with `ObjectId`).
        user_id: Id stored alongside each saved result.
    """
    categorical_types = ("categorical", "true/false")

    file_obj = mongo.db.surveys.find_one({"_id": ObjectId(survey_id)})
    df = read_file(file_obj["fileName"])
    column_info = parse_data(df)
    test_results = []
    for column_1 in column_info:
        # Only categorical columns can act as the first (grouping) variable.
        if column_1["data_type"] not in categorical_types:
            continue
        # Chi square goodness of fit only takes one, non-parametric variable
        p_value, result = chi_goodness(df, column_1["title"])
        if p_value < 0.05:
            test_results.append(result)
        # Now loop through again from the start, checking second variable against the first
        for column_2 in column_info:
            # If the columns are the same then we can continue with the next iteration
            if column_2["title"] == column_1["title"]:
                continue
            if column_2["data_type"] in categorical_types:
                # Chi square needs 2 categorical variables
                p_value, result = chi_square(df, column_1["title"], column_2["title"])
                # As Chi square can be done twice (with variable swapping places)
                # we need to check that it has not yet been done
                if p_value < 0.05 and not test_done(test_results, result):
                    test_results.append(result)
            elif column_2["data_type"] == "numerical":
                if column_1["num_unique"] == 2 and column_2["num_unique"] > 1:
                    # We perform mann-whitney U test
                    p_value, result = mann_whitney(df, column_1["title"], column_2["title"])
                elif column_1["num_unique"] > 2 and column_2["num_unique"] > 1:
                    # We perform kruskal wallis test
                    p_value, result = kruskal_wallis(df, column_1["title"], column_2["title"])
                else:
                    # No test applies to this pair. Skip it so the p-value and
                    # result left over from the previous test are not
                    # appended a second time.
                    continue
                if p_value < 0.05:
                    test_results.append(result)
    # Now we can loop through the statistical tests, adding significant ones to
    # a temporary database. This will be presented to the user through a notification
    # on the home page.
    for result in test_results:
        mongo.db.temp_results.insert_one({
            "user": user_id,
            "survey_id": survey_id,
            "result": result})
# When adding chisquare test of independence, we need to check the test hasn't
# already been carried out (with the variables the opposite way round)
def test_done(previous_results, current_result):
for result in previous_results:
if current_result["variable_1"] == result["variable_2"] and current_result["variable_2"] == result["variable_1"]:
return True
return False
def kruskal_wallis(df, independent_variable, dependent_variable):
    """Run a Kruskal-Wallis test of `dependent_variable` across groups.

    Args:
        df: DataFrame holding both columns.
        independent_variable: Name of the grouping column (passed as `between`).
        dependent_variable: Name of the measured column (passed as `dv`).

    Returns:
        A `(p_value, result)` tuple: the uncorrected p-value formatted to 4
        decimal places, and a dict describing the test ("test", "p_value",
        "variable_1", "variable_2", "null", "info").
    """
    kruskal_result = kruskal(data=df, dv=dependent_variable, between=independent_variable)
    # Take the first row by position. `[0]` on the Series would be a label
    # lookup that only falls back to positional access on older pandas;
    # `.values[0]` is also how mann_whitney reads its p-value.
    p_value = float("%.4f" % kruskal_result["p-unc"].values[0])
    info_template = (
        "Assumes that dependent variable ('{0}') is ordinal or continuous,\n"
        "that the independent variable ('{1}') consists of more than 2 groups\n"
        "and that these groups follow the same distribution (the shape on a histogram).\n\n"
        "NOTE: It is also possible to perform this test on categories containing just 2 groups,\n"
        "however we have not done so as it could conflict with results from Mann-Whitney U test\n"
        "(performed on categories with 2 groups only)."
    )
    result = {
        "test": "Kruskall Wallis Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
        "info": info_template.format(dependent_variable, independent_variable),
    }
    return p_value, result
def mann_whitney(df, independent_variable, dependent_variable):
    """Run a Mann-Whitney U test between the first two groups of a column.

    Args:
        df: DataFrame holding both columns.
        independent_variable: Name of the grouping column; the caller is
            expected to have checked that it has exactly 2 groups.
        dependent_variable: Name of the column whose values are compared.

    Returns:
        A `(p_value, result)` tuple: the p-value formatted to 4 decimal
        places, and a dict describing the test ("test", "p_value",
        "variable_1", "variable_2", "null", "info").
    """
    grouped = df.groupby(independent_variable)
    # One sub-frame per group, in the order of the group keys
    frames = [grouped.get_group(label) for label in grouped.groups]
    first_sample = frames[0][dependent_variable].values
    second_sample = frames[1][dependent_variable].values
    # The group labels are only needed for the explanatory text
    labels = list(grouped.groups.keys())
    group_1 = labels[0]
    group_2 = labels[1]
    outcome = mwu(first_sample, second_sample)
    # Keep 4 decimal places of the reported p-value
    p_value = float("%.4f" % outcome['p-val'].values[0])
    info_template = (
        "Assumes that the dependent variable ('{0}') is ordinal or continuous,\n"
        "that the independent variable ('{1}') consists of just 2 groups\n"
        "('{2}' and '{3}') and that these groups follow the same distribution (the shape\n"
        "on a histogram)."
    )
    result = {
        "test": "Mann-Whitney U Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
        "info": info_template.format(dependent_variable, independent_variable, group_1, group_2),
    }
    return p_value, result
def chi_square(df, variable_1, variable_2):
    """Run a chi-square test of independence between two columns.

    Args:
        df: DataFrame holding both columns.
        variable_1: Name of the first categorical column.
        variable_2: Name of the second categorical column.

    Returns:
        A `(p_value, result)` tuple. If either column fails the
        `five_or_more` check, `(2, {})` is returned: 2 is an impossible
        p-value, so callers comparing against a significance level reject it.
        Otherwise the p-value (4 decimal places) and a dict describing the
        test are returned.
    """
    # Both columns need 80% of their groups to hold at least 5 responses.
    if not (five_or_more(df, variable_1) and five_or_more(df, variable_2)):
        return 2, {}
    observed = pd.crosstab(df[variable_1], df[variable_2])
    # Second element of the result is the p-value
    raw_p = chi2_contingency(observed, correction=False)[1]
    p_value = float("%.4f" % raw_p)
    info_text = (
        "Assumes that both variables are ordinal or nominal,\n"
        "with each variable consisting of 2 or more groups. Also\n"
        "assumes that 80% of the groups contain 5 or more counts."
    )
    result = {
        "test": "Chi-Square test for independence",
        "p_value": p_value,
        "variable_1": variable_1,
        "variable_2": variable_2,
        "null": f"There is no relationship or association between '{variable_1}' and '{variable_2}'",
        "info": info_text,
    }
    return p_value, result
# This checks if each category contains groups with at least a frequency of 5 in each group
# (e.g. If 'apple' is a result for 'favourite food' then this function checks if there are at least
# 5 responses with 'apple'). The chi-square independence test requires that 80% of groups contain
# a frequency of 5 or more.
def five_or_more(df, variable):
    """Check that at least 80% of a column's groups have 5 or more rows.

    Args:
        df: DataFrame holding the column.
        variable: Name of the column to group by.

    Returns:
        True if the share of groups with a frequency of at least 5 is 0.8 or
        higher, False otherwise. A column with no groups at all (for example
        an empty frame) returns False instead of dividing by zero.
    """
    # Size of every group, computed once instead of rescanning the frame per category
    group_sizes = df.groupby(variable).size()
    total_groups = len(group_sizes)
    if total_groups == 0:
        return False
    groups_with_five = int((group_sizes >= 5).sum())
    return groups_with_five / total_groups >= 0.8
def chi_goodness(df, variable):
    """Run a chi-square goodness-of-fit test against an even distribution.

    Args:
        df: DataFrame holding the column.
        variable: Name of the categorical column to test.

    Returns:
        A `(p_value, result)` tuple. If the column has no groups, or any group
        has fewer than 5 rows, `(2, {})` is returned: 2 is an impossible
        p-value, so callers comparing against a significance level reject it.
        Otherwise the p-value (4 decimal places) and a dict describing the
        test are returned.
    """
    # Count of each unique category, computed once
    group_sizes = df.groupby(variable).size()
    # Each group must have a frequency of at least 5, so a group of exactly 5
    # is accepted (the previous `<= 5` check wrongly rejected it).
    if group_sizes.empty or (group_sizes < 5).any():
        return 2, {}
    actual_distribution = group_sizes.tolist()
    # we will assume expected even distribution and only pass the actual distribution
    _, p_value = chisquare(actual_distribution)
    # Convert to 4 decimal places
    p_value = float("%.4f" % p_value)
    info_text = (
        "Assumes that the expected distribution is even accross groups,\n"
        "that each group is mutually exclusive from the next and each group\n"
        "contains at least 5 subjects."
    )
    result = {
        "test": "Chi-Square goodness of fit",
        "p_value": p_value,
        "variable_1": variable,
        "variable_2": "",
        "null": f"Groups of '{variable}' are evenly distributed",
        "info": info_text,
    }
    return p_value, result
# Takes all the tests from the database and writes them to the Excel worksheet
def tests_to_excel(worksheet, tests):
    """Write saved statistical tests into a worksheet as a 5-column table.

    Columns are: null hypothesis, test name, significance value (0.05),
    p-value and conclusion. Nothing is written when there are no tests.

    Args:
        worksheet: Object exposing `add_table(range, options)` and
            `write(row, col, value)` (presumably an xlsxwriter worksheet --
            TODO confirm).
        tests: Iterable of mappings with the keys "test", "p",
            "independentVariable" and "dependentVariable". A database cursor
            or a plain list both work.
    """
    significance = 0.05
    # Materialise once: works for any iterable, and avoids `tests.count()`,
    # which only exists on legacy PyMongo cursors.
    rows = list(tests)
    if not rows:
        return
    # End of table is the number of tests + 1 for the column headers
    table_size = "A1:E" + str(len(rows) + 1)
    # Set column headers
    worksheet.add_table(table_size, {'columns': [{'header': "Null Hypothesis"},
                                                 {'header': "Statistical Test"},
                                                 {'header': "Significance Value"},
                                                 {'header': "P-Value"},
                                                 {'header': "Conclusion"}]})
    # Row numbers start at 1 since the first row 0 is the header
    for row_number, test in enumerate(rows, start=1):
        if float(test["p"]) < significance:
            conclusion = "Reject the null hypothesis."
        else:
            conclusion = "Accept the null hypothesis."
        null_hypothesis = get_null_hypothesis(
            test["test"], test["independentVariable"], test["dependentVariable"])
        worksheet.write(row_number, 0, null_hypothesis)
        worksheet.write(row_number, 1, test["test"])
        worksheet.write(row_number, 2, significance)
        worksheet.write(row_number, 3, test["p"])
        worksheet.write(row_number, 4, conclusion)
# gets the null hypothesis, depending on the type of test
def get_null_hypothesis(test, variable_1, variable_2):
    """Build the null-hypothesis sentence for a saved test.

    Args:
        test: Name of the statistical test.
        variable_1: First variable; the caller in this module passes the
            independent (grouping) variable here.
        variable_2: Second variable; the caller in this module passes the
            dependent variable here. Unused for the goodness-of-fit test.

    Returns:
        The null hypothesis as a sentence. Any test name other than the two
        chi-square names gets the "same distribution across groups" wording.
    """
    if test == "Chi-Square goodness of fit":
        return f"There is no significant difference between the expected distribution of {variable_1} and the observed distribution."
    if test == "Chi-Square Test":
        return f"There is no association between {variable_1} and {variable_2}"
    # The dependent variable (variable_2) is the one being distributed and the
    # independent variable (variable_1) defines the groups; the previous
    # wording had the two swapped.
    return f"The distribution of {variable_2} is the same across groups of {variable_1}"