datasaur/site/surveyapp/analysis/utils.py

import pandas as pd
from surveyapp import mongo
# from flask import Flask, current_app
from bson.objectid import ObjectId
# For carrying out statistical test
from scipy.stats import chi2_contingency, chisquare
from pingouin import kruskal, mwu
from surveyapp.surveys.utils import parse_data, read_file

import time

# This function runs automatically when the user uploads a file. It parses the data and runs
# statistical tests based on the type of data in each column. It does not run every possible test: for
# example, non-parametric tests are only run on definite categorical data (i.e. data that is string, object
# or boolean). Ordinal data with numeric values, such as a Likert scale, is not tested because it is
# identified as numeric. Furthermore, the user has to check the result of each test to confirm that the
# data passes the assumptions of that test.
# Likewise, Mann-Whitney U and Kruskal-Wallis are never both performed on the same variables, even though
# it is possible to perform Kruskal-Wallis on a variable with just 2 groups. This avoids conflicting results
# and the increased risk of false positives that would follow.
def run_all_tests(survey_id, user_id, app):
    """Call ``run_tests`` for the survey with the application context of ``app`` pushed.

    Args:
        survey_id: id of the survey to analyse; passed straight to ``run_tests``.
        user_id: id of the user; passed straight to ``run_tests``.
        app: application object providing ``app_context()``.
    """
    context = app.app_context()
    with context:
        run_tests(survey_id, user_id)

def run_tests(survey_id, user_id):
    """Run the automatic significance tests for one survey and store the significant ones.

    Every categorical or true/false column gets a chi-square goodness of fit
    test. It is then paired with every other column: a chi-square test of
    independence against categorical or true/false columns, and a
    Mann-Whitney U test (exactly 2 groups) or Kruskal-Wallis test (more than
    2 groups) against numerical columns. Each result with a p-value below 0.05
    is inserted into the ``temp_results`` collection together with the user
    and survey ids.

    Args:
        survey_id: id of the survey document; converted with ``ObjectId`` for the lookup.
        user_id: id stored alongside every saved result.
    """
    significance_level = 0.05
    categorical_types = ("categorical", "true/false")

    file_obj = mongo.db.surveys.find_one({"_id": ObjectId(survey_id)})
    df = read_file(file_obj["fileName"])
    column_info = parse_data(df)
    test_results = []
    for column_1 in column_info:
        if column_1["data_type"] not in categorical_types:
            continue
        # Chi square goodness of fit only takes one, non-parametric variable
        p_value, result = chi_goodness(df, column_1["title"])
        if p_value < significance_level:
            test_results.append(result)
        # Now loop through again from the start, checking second variable against the first
        for column_2 in column_info:
            # A column is never tested against itself
            if column_2["title"] == column_1["title"]:
                continue
            if column_2["data_type"] in categorical_types:
                # Chi square needs 2 categorical variables
                p_value, result = chi_square(df, column_1["title"], column_2["title"])
                # As Chi square can be done twice (with variables swapping places)
                # we need to check that it has not yet been done
                if p_value < significance_level and not test_done(test_results, result):
                    test_results.append(result)
            elif column_2["data_type"] == "numerical":
                # A dependent variable with a single distinct value cannot be compared
                if column_2["num_unique"] <= 1:
                    continue
                if column_1["num_unique"] == 2:
                    p_value, result = mann_whitney(df, column_1["title"], column_2["title"])
                elif column_1["num_unique"] > 2:
                    p_value, result = kruskal_wallis(df, column_1["title"], column_2["title"])
                else:
                    # No test applies. Skip explicitly so that the p-value and
                    # result left over from the previous test are not
                    # re-evaluated and appended a second time.
                    continue
                if p_value < significance_level:
                    test_results.append(result)
    # Now we can loop through the statistical tests, adding significant ones to
    # a temporary database. This will be presented to the user through a notification
    # on the home page.
    for result in test_results:
        mongo.db.temp_results.insert_one({
            "user": user_id,
            "survey_id": survey_id,
            "result": result})

# When adding chisquare test of independence, we need to check the test hasn't
# already been carried out (with the variables the opposite way round)
def test_done(previous_results, current_result):
    """Return True if the same pair of variables was already tested in the opposite order.

    Args:
        previous_results: result dicts collected so far, each with
            ``"variable_1"`` and ``"variable_2"`` keys.
        current_result: the result dict that is about to be added.

    Returns:
        True when some earlier result has ``variable_1``/``variable_2`` swapped
        relative to ``current_result``, otherwise False.
    """
    return any(
        current_result["variable_1"] == earlier["variable_2"]
        and current_result["variable_2"] == earlier["variable_1"]
        for earlier in previous_results
    )


# The name starts with pytest's default "test" prefix, so a test module that
# imports this helper would otherwise have pytest collect it as a test and fail
# looking for fixtures named after its parameters.
test_done.__test__ = False


def kruskal_wallis(df, independent_variable, dependent_variable):
    """Run a Kruskal-Wallis test of ``dependent_variable`` between groups of ``independent_variable``.

    Args:
        df: DataFrame holding both columns.
        independent_variable: title of the grouping column.
        dependent_variable: title of the column whose distribution is compared.

    Returns:
        A ``(p_value, result)`` tuple: the p-value rounded to 4 decimals and a
        dict describing the test (name, p-value, variables, null hypothesis
        and the assumptions the user has to check).
    """
    kruskal_result = kruskal(data=df, dv=dependent_variable, between=independent_variable)
    # Take the first "p-unc" value by position and keep 4 decimal places only.
    # ``.values[0]`` is used instead of ``["p-unc"][0]``: an integer key on a
    # Series without an integer index relies on pandas' deprecated positional
    # fallback. This also matches how mann_whitney reads its p-value.
    p_value = float("%.4f" % kruskal_result["p-unc"].values[0])
    result = {"test": "Kruskall Wallis Test",
            "p_value": p_value,
            "variable_1": independent_variable,
            "variable_2": dependent_variable,
            "null": f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'",
            "info": """Assumes that dependent variable ('{0}') is ordinal or continuous,
                    that the independent variable ('{1}') consists of more than 2 groups
                    and that these groups follow the same distribution (the shape on a histogram).\n
                    NOTE: It is also possible to perform this test on categories containing just 2 groups,
                    however we have not done so as it could conflict with results from Mann-Whitney U test
                    (performed on categories with 2 groups only).""".format(dependent_variable, independent_variable)}
    return p_value, result


def mann_whitney(df, independent_variable, dependent_variable):
    """Run a Mann-Whitney U test of ``dependent_variable`` between the first two groups of ``independent_variable``.

    Args:
        df: DataFrame holding both columns.
        independent_variable: title of the grouping column (the caller only
            passes columns it found to have 2 distinct values).
        dependent_variable: title of the column whose values are compared.

    Returns:
        A ``(p_value, result)`` tuple: the p-value rounded to 4 decimals and a
        dict describing the test.
    """
    grouped = df.groupby(independent_variable)
    # Labels of the distinct groups; only the first two are used
    labels = list(grouped.groups.keys())
    group_1, group_2 = labels[0], labels[1]
    # Dependent-variable values of each of the two groups
    x = grouped.get_group(group_1)[dependent_variable].values
    y = grouped.get_group(group_2)[dependent_variable].values
    outcome = mwu(x, y)
    # Keep the p-value to 4 decimal places
    p_value = float("%.4f" % outcome['p-val'].values[0])
    null_hypothesis = f"The distribution of '{dependent_variable}' is the same across groups of '{independent_variable}'"
    assumptions = """Assumes that the dependent variable ('{0}') is ordinal or continuous,
                    that the independent variable ('{1}') consists of just 2 groups
                    ('{2}' and '{3}') and that these groups follow the same distribution (the shape
                    on a histogram).""".format(dependent_variable, independent_variable, group_1, group_2)
    result = {
        "test": "Mann-Whitney U Test",
        "p_value": p_value,
        "variable_1": independent_variable,
        "variable_2": dependent_variable,
        "null": null_hypothesis,
        "info": assumptions,
    }
    return p_value, result


def chi_square(df, variable_1, variable_2):
    """Run a chi-square test of independence between two categorical columns.

    Args:
        df: DataFrame holding both columns.
        variable_1: title of the first column.
        variable_2: title of the second column.

    Returns:
        A ``(p_value, result)`` tuple. When either column fails the
        ``five_or_more`` check, ``(2, {})`` is returned instead: 2 is an
        impossible p-value, so the caller's significance check rejects it.
    """
    # Both columns need 80% of their groups to have a frequency of at least 5
    if not (five_or_more(df, variable_1) and five_or_more(df, variable_2)):
        return 2, {}
    observed = pd.crosstab(df[variable_1], df[variable_2])
    # The p-value is the second item of the chi2_contingency result
    raw_p_value = chi2_contingency(observed, correction=False)[1]
    # Keep 4 decimal places only
    p_value = float("%.4f" % raw_p_value)
    assumptions = """Assumes that both variables are ordinal or nominal,
                    with each variable consisting of 2 or more groups. Also
                    assumes that 80% of the groups contain 5 or more counts."""
    result = {
        "test": "Chi-Square test for independence",
        "p_value": p_value,
        "variable_1": variable_1,
        "variable_2": variable_2,
        "null": f"There is no relationship or association between '{variable_1}' and '{variable_2}'",
        "info": assumptions,
    }
    return p_value, result

# This checks whether enough of the groups within a category have a frequency of at least 5
# (e.g. if 'apple' is a response for 'favourite food' then this function checks whether there are at
# least 5 responses of 'apple'). The chi-square independence test requires that 80% of groups contain
# a frequency of 5 or more.
def five_or_more(df, variable):
    """Return True when at least 80% of the groups in ``variable`` have 5 or more rows.

    Args:
        df: DataFrame holding the column.
        variable: title of the column to check.

    Returns:
        True if the share of distinct values with a frequency of at least 5
        is 0.8 or higher, otherwise False. A column with no non-missing values
        has no groups to check and returns False.
    """
    # One pass gives the frequency of every distinct value (missing values are not counted)
    group_counts = df[variable].value_counts()
    total_groups = len(group_counts)
    if total_groups == 0:
        # Avoids dividing by zero; with no groups the requirement cannot be met
        return False
    groups_with_5_or_more = int((group_counts >= 5).sum())
    return groups_with_5_or_more / total_groups >= 0.8


def chi_goodness(df, variable):
    """Run a chi-square goodness of fit test on one categorical column.

    The observed group frequencies are tested against an even distribution
    across the groups.

    Args:
        df: DataFrame holding the column.
        variable: title of the column to test.

    Returns:
        A ``(p_value, result)`` tuple: the p-value rounded to 4 decimals and a
        dict describing the test. ``(2, {})`` is returned when the test cannot
        be run (fewer than 2 groups, or a group with a frequency below 5):
        2 is an impossible p-value, so the caller's significance check
        rejects it.
    """
    # Frequency of every distinct value (missing values are not counted)
    group_counts = df[variable].value_counts()
    # An even split needs at least 2 groups to be compared
    if len(group_counts) < 2:
        return 2, {}
    # Each group must have a frequency of at least 5, so a group of exactly 5 passes
    if (group_counts < 5).any():
        return 2, {}
    # we will assume expected even distribution and only pass the actual distribution
    _, p_value = chisquare(group_counts.tolist())
    # Convert to 4 decimal places
    p_value = float("%.4f" % p_value)
    result = {"test": "Chi-Square goodness of fit",
            "p_value": p_value,
            "variable_1": variable,
            "variable_2": "",
            "null": f"Groups of '{variable}' are evenly distributed",
            "info": """Assumes that the expected distribution is even accross groups,
                    that each group is mutually exclusive from the next and each group
                    contains at least 5 subjects."""}
    return p_value, result

# Takes all the tests from the database and writes them to the Excel worksheet
def tests_to_excel(worksheet, tests):
    """Write the saved statistical tests to ``worksheet`` as a table, one test per row.

    Args:
        worksheet: worksheet object providing ``add_table`` and ``write``.
        tests: iterable of test documents (a database cursor or a list), each
            with the keys ``"test"``, ``"p"``, ``"independentVariable"`` and
            ``"dependentVariable"``.
    """
    # Materialise once so the rows can be counted with len(): cursors no longer
    # offer count() in PyMongo 4, and plain lists never had a no-argument count().
    tests = list(tests)
    if tests:
        # End of the table is the number of tests + 1 for the column headers
        table_size = "A1:E" + str(len(tests) + 1)
        # Set column headers
        worksheet.add_table(table_size, {'columns': [{'header': "Null Hypothesis"},
                                              {'header': "Statistical Test"},
                                              {'header': "Significance Value"},
                                              {'header': "P-Value"},
                                              {'header': "Conclusion"}]})
    # Rows start at 1 since the first row 0 is the header
    for row_number, test in enumerate(tests, start=1):
        if float(test["p"]) < 0.05:
            conclusion = "Reject the null hypothesis."
        else:
            conclusion = "Accept the null hypothesis."
        worksheet.write(row_number, 0, get_null_hypothesis(test["test"], test["independentVariable"], test["dependentVariable"]))
        worksheet.write(row_number, 1, test["test"])
        worksheet.write(row_number, 2, 0.05)
        worksheet.write(row_number, 3, test["p"])
        worksheet.write(row_number, 4, conclusion)


# The name starts with pytest's default "test" prefix, so a test module that
# imports this function would otherwise have pytest collect it as a test.
tests_to_excel.__test__ = False


# gets the null hypothesis, depending on the type of test
def get_null_hypothesis(test, variable_1, variable_2):
    """Return the null hypothesis sentence for a named statistical test.

    Args:
        test: name of the statistical test.
        variable_1: first variable of the test. For tests that compare groups
            this is the independent (grouping) variable, which is how
            ``tests_to_excel`` passes it.
        variable_2: second variable of the test; the dependent variable for
            tests that compare groups.

    Returns:
        The null hypothesis as a string.
    """
    if test == "Chi-Square goodness of fit":
        return "There is no significant difference between the expected distribution of " + variable_1 + " and the observed distribution."
    # Both names used for the chi-square test of independence are accepted, so
    # the name produced by chi_square does not fall through to the
    # distribution-based wording below.
    if test in ("Chi-Square Test", "Chi-Square test for independence"):
        return "There is no association between " + variable_1 + " and " + variable_2
    # Remaining tests compare the distribution of the dependent variable
    # (variable_2) between the groups of the independent variable (variable_1).
    return "The distribution of " + variable_2 + " is the same across groups of " + variable_1