import pandas as pd
from surveyapp import mongo
# from flask import Flask, current_app
from bson.objectid import ObjectId
# For carrying out statistical test
from scipy.stats import chi2_contingency, chisquare
from pingouin import kruskal, mwu
from surveyapp.surveys.utils import parse_data, read_file
import time

# The functions in this module run automatically when the user uploads a file.
# The data is parsed and a set of statistical tests is run based on the type of
# data in each column. Not every possible test is run: non-parametric tests are
# only run on definite categorical data (i.e. data that is string, object or
# boolean). Ordinal data with numeric values, such as a Likert scale, is not
# tested because it is identified as numeric. The user still has to check that
# the data passes the assumptions of each reported test.
# Likewise, Mann-Whitney U and Kruskal-Wallis are never performed on the same
# variables, even though Kruskal-Wallis can be performed on 2 groups. This
# avoids conflicting results and the subsequent increased risk of false
# positives.

# Significance level below which a test result is reported.
_SIGNIFICANCE_LEVEL = 0.05
# Column data types (as labelled by parse_data) treated as categorical.
_CATEGORICAL_TYPES = ("categorical", "true/false")
# Minimum frequency a group needs for the chi-square tests.
_MIN_GROUP_FREQUENCY = 5
# Sentinel returned when a test's preconditions are not met: 2 is an
# impossible p-value, so it can never pass the significance check.
_NOT_APPLICABLE = 2


def _format_p_value(p_value):
    """Return p_value as a float limited to 4 decimal places."""
    return float("%.4f" % p_value)


def run_all_tests(survey_id, user_id, app):
    """Run run_tests for the survey inside the application context of app."""
    with app.app_context():
        run_tests(survey_id, user_id)


def run_tests(survey_id, user_id):
    """Run the automatic tests on a survey file and store significant results.

    The survey document is looked up by survey_id, its file is read and parsed,
    and every result with a p-value below the significance level is inserted
    into the temp_results collection together with user_id and survey_id.
    """
    file_obj = mongo.db.surveys.find_one({"_id": ObjectId(survey_id)})
    df = read_file(file_obj["fileName"])
    column_info = parse_data(df)
    test_results = []
    for column_1 in column_info:
        if column_1["data_type"] not in _CATEGORICAL_TYPES:
            continue
        # Chi square goodness of fit only takes one, non-parametric variable
        p_value, result = chi_goodness(df, column_1["title"])
        if p_value < _SIGNIFICANCE_LEVEL:
            test_results.append(result)
        # Now loop through again from the start, checking the second variable
        # against the first
        for column_2 in column_info:
            # A column is never tested against itself
            if column_2["title"] == column_1["title"]:
                continue
            if column_2["data_type"] in _CATEGORICAL_TYPES:
                # Chi square needs 2 categorical variables
                p_value, result = chi_square(
                    df, column_1["title"], column_2["title"])
                # As chi square can be done twice (with the variables swapping
                # places) we need to check that it has not yet been done
                if (p_value < _SIGNIFICANCE_LEVEL
                        and not test_done(test_results, result)):
                    test_results.append(result)
            elif column_2["data_type"] == "numerical":
                if column_2["num_unique"] <= 1:
                    continue
                if column_1["num_unique"] == 2:
                    # We perform Mann-Whitney U test
                    p_value, result = mann_whitney(
                        df, column_1["title"], column_2["title"])
                elif column_1["num_unique"] > 2:
                    # We perform Kruskal-Wallis test
                    p_value, result = kruskal_wallis(
                        df, column_1["title"], column_2["title"])
                else:
                    # No test applies; skip so that the p-value and result of
                    # a previous test are not evaluated (and stored) again.
                    continue
                if p_value < _SIGNIFICANCE_LEVEL:
                    test_results.append(result)
    # Now we can loop through the statistical tests, adding significant ones to
    # a temporary database. This will be presented to the user through a
    # notification on the home page.
    for result in test_results:
        mongo.db.temp_results.insert_one({
            "user": user_id,
            "survey_id": survey_id,
            "result": result})


def test_done(previous_results, current_result):
    """Return True if a previous result used the same variables swapped round.

    Needed for the chi-square test of independence, which would otherwise be
    reported twice (once for each ordering of the two variables).
    """
    return any(
        current_result["variable_1"] == previous["variable_2"]
        and current_result["variable_2"] == previous["variable_1"]
        for previous in previous_results
    )


def kruskal_wallis(df, independent_variable, dependent_variable):
    """Run a Kruskal-Wallis test and return (p_value, result dict)."""
    kruskal_result = kruskal(
        data=df, dv=dependent_variable, between=independent_variable)
    # Get the p-value (p-unc) from the kruskal test. The value is read by
    # position because the result row is not labelled with the integer 0.
    p_value = _format_p_value(kruskal_result["p-unc"].values[0])
    result = {
        "test": "Kruskall Wallis Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
        "info": (
            f"Assumes that dependent variable ('{dependent_variable}') is "
            "ordinal or continuous, that the independent variable "
            f"('{independent_variable}') consists of more than 2 groups and "
            "that these groups follow the same distribution (the shape on a "
            "histogram).\n NOTE: It is also possible to perform this test on "
            "categories containing just 2 groups, however we have not done so "
            "as it could conflict with results from Mann-Whitney U test "
            "(performed on categories with 2 groups only)."
        ),
    }
    return p_value, result


def mann_whitney(df, independent_variable, dependent_variable):
    """Run a Mann-Whitney U test and return (p_value, result dict).

    The first two groups of independent_variable are compared. If grouping
    yields fewer than two groups, (2, {}) is returned so the caller rejects it.
    """
    # Group the data by the independent_variable
    group_by = df.groupby(independent_variable)
    keys = list(group_by.groups.keys())
    if len(keys) < 2:
        return _NOT_APPLICABLE, {}
    # Get the distinct keys (the caller has already checked there are only 2)
    group_1, group_2 = keys[0], keys[1]
    # Get the values of groups 1 and 2
    x = group_by.get_group(group_1)[dependent_variable].values
    y = group_by.get_group(group_2)[dependent_variable].values
    # Perform test
    mwu_result = mwu(x, y)
    # Get the p_value from the result and format to 4 decimals
    p_value = _format_p_value(mwu_result['p-val'].values[0])
    result = {
        "test": "Mann-Whitney U Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
        "info": (
            f"Assumes that the dependent variable ('{dependent_variable}') is "
            "ordinal or continuous, that the independent variable "
            f"('{independent_variable}') consists of just 2 groups "
            f"('{group_1}' and '{group_2}') and that these groups follow the "
            "same distribution (the shape on a histogram)."
        ),
    }
    return p_value, result


def chi_square(df, variable_1, variable_2):
    """Run a chi-square test of independence and return (p_value, result dict).

    Returns (2, {}) when either variable fails the five_or_more check.
    """
    # 80% of groups must have a frequency of at least 5.
    if not five_or_more(df, variable_1) or not five_or_more(df, variable_2):
        # If not, we return 2, which is an impossible p-value and will be
        # rejected.
        return _NOT_APPLICABLE, {}
    contingency_table = pd.crosstab(df[variable_1], df[variable_2])
    _, p_value, _, _ = chi2_contingency(contingency_table, correction=False)
    p_value = _format_p_value(p_value)
    result = {
        "test": "Chi-Square test for independence",
        "p_value": p_value,
        "variable_1": variable_1,
        "variable_2": variable_2,
        "null": f"There is no relationship or association between '{variable_1}' and '{variable_2}'",
        "info": (
            "Assumes that both variables are ordinal or nominal, with each "
            "variable consisting of 2 or more groups. Also assumes that 80% of "
            "the groups contain 5 or more counts."
        ),
    }
    return p_value, result


def five_or_more(df, variable):
    """Return True if at least 80% of the groups of variable have 5+ rows.

    For example, if 'apple' is a response for 'favourite food', this counts
    whether there are at least 5 responses of 'apple'. The chi-square
    independence test requires that 80% of groups contain a frequency of 5 or
    more. A column with no groups at all returns False.
    """
    # Size of every unique category, computed in a single pass
    group_sizes = df.groupby(variable).size()
    total_count = len(group_sizes)
    if total_count == 0:
        # Nothing to test (also avoids dividing by zero below)
        return False
    count_over_5 = int((group_sizes >= _MIN_GROUP_FREQUENCY).sum())
    return count_over_5 / total_count >= 0.8


def chi_goodness(df, variable):
    """Run a chi-square goodness of fit test and return (p_value, result dict).

    An even expected distribution is assumed. Returns (2, {}) when the column
    has no groups or when any group has a frequency below 5.
    """
    # Count of every unique category, computed in a single pass
    actual_distribution = df.groupby(variable).size().tolist()
    # Each group must have a frequency of at least 5. If not, we return 2,
    # which is an impossible p-value and will be rejected.
    if (not actual_distribution
            or min(actual_distribution) < _MIN_GROUP_FREQUENCY):
        return _NOT_APPLICABLE, {}
    # We assume an even expected distribution, so only the actual distribution
    # is passed
    _, p_value = chisquare(actual_distribution)
    # Convert to 4 decimal places
    p_value = _format_p_value(p_value)
    result = {
        "test": "Chi-Square goodness of fit",
        "p_value": p_value,
        "variable_1": variable,
        "variable_2": "",
        "null": f"Groups of '{variable}' are evenly distributed",
        "info": (
            "Assumes that the expected distribution is even accross groups, "
            "that each group is mutually exclusive from the next and each "
            "group contains at least 5 subjects."
        ),
    }
    return p_value, result


def tests_to_excel(worksheet, tests):
    """Write all the given tests to the Excel worksheet as a table.

    tests is an iterable of test documents with the keys "test", "p",
    "independentVariable" and "dependentVariable". Nothing is written when
    there are no tests.
    """
    # Materialise once so the table size always matches the rows written and
    # no cursor-specific count method is needed.
    tests = list(tests)
    if not tests:
        return
    # End of table is the number of tests + 1 for the column headers
    end_of_table = len(tests) + 1
    table_size = "A1:E" + str(end_of_table)
    # Set column headers
    worksheet.add_table(table_size, {'columns': [
        {'header': "Null Hypothesis"},
        {'header': "Statistical Test"},
        {'header': "Significance Value"},
        {'header': "P-Value"},
        {'header': "Conclusion"}]})
    # Row numbers start at 1 since the first row (0) is the header
    for row_number, test in enumerate(tests, start=1):
        if float(test["p"]) < _SIGNIFICANCE_LEVEL:
            conclusion = "Reject the null hypothesis."
        else:
            conclusion = "Accept the null hypothesis."
        null_hypothesis = get_null_hypothesis(
            test["test"], test["independentVariable"],
            test["dependentVariable"])
        worksheet.write(row_number, 0, null_hypothesis)
        worksheet.write(row_number, 1, test["test"])
        worksheet.write(row_number, 2, _SIGNIFICANCE_LEVEL)
        worksheet.write(row_number, 3, test["p"])
        worksheet.write(row_number, 4, conclusion)


def get_null_hypothesis(test, variable_1, variable_2):
    """Return the null hypothesis sentence for the given type of test."""
    if test == "Chi-Square goodness of fit":
        return f"There is no significant difference between the expected distribution of {variable_1} and the observed distribution."
    if test == "Chi-Square Test":
        return f"There is no association between {variable_1} and {variable_2}"
    # NOTE(review): tests_to_excel passes the independent variable as
    # variable_1, so this reads "distribution of <independent> ... across
    # groups of <dependent>" - confirm the intended order with other callers.
    return f"The distribution of {variable_1} is the same across groups of {variable_2}"