222 lines
10 KiB
Python
222 lines
10 KiB
Python
import pandas as pd
|
|
from pandas.api.types import is_string_dtype
|
|
from scipy.stats import chi2_contingency, chisquare
|
|
from pingouin import kruskal, mwu
|
|
from surveyapp import mongo
|
|
from flask import Flask, render_template, url_for, request, Blueprint, flash, redirect, abort, send_file
|
|
from flask_login import login_required, current_user
|
|
from surveyapp.analysis.forms import StatisticalTestForm, ChiGoodnessEntryForm, ChiGoodnessForm
|
|
from surveyapp.surveys.forms import EditForm
|
|
from bson.objectid import ObjectId
|
|
|
|
import tempfile
|
|
from xlsxwriter import Workbook
|
|
|
|
from surveyapp.surveys.utils import parse_data, read_file
|
|
from surveyapp.analysis.utils import tests_to_excel
|
|
|
|
|
|
# Blueprint that the statistical-analysis routes in this module register on
analysis = Blueprint("analysis", __name__)
|
|
|
|
|
|
# Analyse data sets
@analysis.route("/analyse/<survey_id>", methods=['GET', 'POST'])
@login_required
def analyse(survey_id):
    """Run a statistical test chosen by the user on one of their surveys.

    On GET (or a failed validation) the test-selection form is rendered.
    On a valid POST the selected test is executed and the user is redirected
    to ``analysis.result`` with the p-value and the chosen variables in the
    query string.  Chi-Square goodness of fit is handed off to
    ``analysis.chi_goodness`` because it needs an extra form.

    After a failed check the template is rendered again rather than
    redirecting back to this route, so that the form fields remain filled in
    and the user doesn't have to re-enter their choices.

    Args:
        survey_id: String form of the survey's Mongo ``_id``.

    Aborts with 404 if the survey does not exist and 403 if it belongs to
    another user.
    """
    template = "analysis/analysedata.html"
    form = StatisticalTestForm()
    survey = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    if survey["user"] != current_user._id:
        flash("You do not have access to that page", "danger")
        abort(403)
    df = read_file(survey["fileName"])

    # Populate the select options in the form with all the variables
    for variable in df.columns.values:
        form.independent_variable.choices.append((variable, variable))
        form.dependent_variable.choices.append((variable, variable))

    if not form.validate_on_submit():
        return render_template(template, form=form)

    independent_variable = form.independent_variable.data
    dependent_variable = form.dependent_variable.data
    # Ensure the user hasn't selected the same variable for both
    if independent_variable == dependent_variable:
        flash("You can't select the same variable for both.", "danger")
        return render_template(template, form=form)

    test = form.test.data
    # If the user selects Chi-Square goodness fit then they are redirected to a separate URL
    if test == "Chi-Square goodness of fit":
        return redirect(url_for('analysis.chi_goodness', variable=independent_variable, survey_id=survey_id))

    # The other tests all require a dependent variable (treat None like "")
    if not dependent_variable:
        flash("You must select a dependent variable for this test.", "danger")
        return render_template(template, form=form)

    if test == "Kruskall Wallis Test":
        if is_string_dtype(df[dependent_variable]):
            flash("Dependent Variable '" + dependent_variable + "' is not numeric.", "danger")
            return render_template(template, form=form)
        kruskal_result = kruskal(data=df, dv=dependent_variable, between=independent_variable)
        # Take the p-value (p-unc) positionally: the result frame is indexed
        # by label, so a bare [0] would be a label lookup, not "first row".
        p_value = "%.4f" % kruskal_result["p-unc"].iloc[0]
    # AT THE MOMENT, THIS TEST IS 2 TAILED. MAY WANT TO ADD OPTIONS FOR 1 TAILED TESTS
    elif test == "Mann-Whitney U Test":
        if is_string_dtype(df[dependent_variable]):
            flash("Dependent Variable '" + dependent_variable + "' is not numeric.", "danger")
            return render_template(template, form=form)
        group_by = df.groupby(independent_variable)
        group_array = [group_by.get_group(x) for x in group_by.groups]
        if len(group_array) > 2:
            flash("Independent variable '" + independent_variable + "' has too many groups, only 2 allowed for Mann-Whitney U Test.", "danger")
            return render_template(template, form=form)
        if len(group_array) < 2:
            # Previously reported as "too many groups", which was misleading
            flash("Independent variable '" + independent_variable + "' has too few groups, exactly 2 are required for Mann-Whitney U Test.", "danger")
            return render_template(template, form=form)
        x = group_array[0][dependent_variable].values
        y = group_array[1][dependent_variable].values
        mwu_result = mwu(x, y)
        p_value = "%.4f" % mwu_result['p-val'].values[0]
    elif test == "Chi-Square Test":
        contingency_table = pd.crosstab(df[independent_variable], df[dependent_variable])
        _, p_value, _, _ = chi2_contingency(contingency_table, correction=False)
    else:
        # Without this branch an unhandled test name left p_value unbound
        # and the redirect below raised UnboundLocalError.
        flash("Unknown statistical test selected.", "danger")
        return render_template(template, form=form)

    return redirect(url_for('analysis.result',
                            survey=survey_id,
                            test=test,
                            p_value=p_value,
                            independent_variable=independent_variable,
                            dependent_variable=dependent_variable))
|
|
|
|
|
|
# Chi goodness of fit - extra form for expected values
@analysis.route("/chi/<survey_id>/<variable>", methods=['GET', 'POST'])
@login_required
def chi_goodness(survey_id, variable):
    """Collect expected counts and run a Chi-Square goodness-of-fit test.

    The form holds one entry per distinct group of ``variable``.  On a valid
    POST the observed group counts are compared with the submitted expected
    counts (or with a uniform distribution when every expected count is 0)
    and the user is redirected to ``analysis.result``.

    Args:
        survey_id: String form of the survey's Mongo ``_id``.
        variable: Name of the column whose distribution is tested.

    Aborts with 404 if the survey or the column does not exist and 403 if the
    survey belongs to another user.
    """
    # Get survey object and dataframe
    survey = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    # Same ownership rule as the other survey routes in this module
    if survey["user"] != current_user._id:
        flash("You do not have access to that page", "danger")
        abort(403)
    df = read_file(survey["fileName"])
    if variable not in df.columns:
        # The column name comes straight from the URL
        abort(404)
    group_by = df.groupby(variable)
    keys = list(group_by.groups.keys())
    # Get the total count, so that we can check the expected distribution matches
    total_count = len(df.index)
    # Populate the form with unique groups in the given variable,
    # initialising "expected" to 0
    key_list = [{"expected": 0, "key": key} for key in keys]
    form = ChiGoodnessForm(field=key_list)
    if form.validate_on_submit():
        # Lists for actual and expected distributions in the data
        actual_distribution = []
        expected_distribution = []
        for key in keys:
            # For each group, we get the count in the data and append it to our list
            actual_distribution.append(df[df[variable] == key].shape[0])
            # Now we take the expected count from the form data.
            # NOTE(review): submitted keys presumably come back as strings, so
            # the string forms are compared as well - confirm against
            # ChiGoodnessEntryForm.  A group with no matching entry counts as 0
            # so both lists always have the same length.
            expected = 0
            for entry in form.field.data:
                if key == entry['key'] or str(key) == str(entry['key']):
                    expected = entry['expected'] or 0
                    break
            expected_distribution.append(expected)
        if sum(expected_distribution) == 0:
            # No expectations supplied: test against a uniform distribution
            _, p_value = chisquare(actual_distribution)
        else:
            try:
                _, p_value = chisquare(actual_distribution, expected_distribution)
            except ValueError:
                # scipy rejects expected counts whose total differs from the
                # observed total
                flash("The expected values must add up to " + str(sum(actual_distribution)) + ".", "danger")
                return render_template("analysis/chisquare.html", form=form, keys=keys, total=total_count)
        return redirect(url_for('analysis.result',
                                survey=survey_id,
                                test="Chi-Square goodness of fit",
                                p_value=p_value,
                                independent_variable=variable,))
    return render_template("analysis/chisquare.html", form=form, keys=keys, total=total_count)
|
|
|
|
|
|
|
|
|
|
# Results from stats test
@analysis.route("/result", methods=['GET', 'POST'])
@login_required
def result():
    """Show the outcome of a statistical test and let the user save it.

    All inputs arrive in the query string: ``p_value``, ``test``,
    ``independent_variable``, optional ``dependent_variable``, ``survey``,
    and optional ``test_id`` / ``title`` when an existing saved test is being
    viewed or edited.  On a valid POST the test is upserted into
    ``mongo.db.tests`` and the user is redirected to the survey dashboard.

    Aborts with 400 if ``p_value`` is missing or not a number and 403 if
    ``test_id`` refers to a test saved by another user.
    """
    form = EditForm()
    # Set a default alpha value 0.05 to compare the p value to
    alpha = 0.05
    # Cast string to float so it can be compared with the alpha value; a
    # missing or malformed value is a client error, not a server crash
    try:
        p_value = float(request.args.get("p_value"))
    except (TypeError, ValueError):
        abort(400)
    test = request.args.get("test")
    independent_variable = request.args.get("independent_variable")
    # Chi goodness does not have a dependent_variable
    dependent_variable = request.args.get("dependent_variable") or ""
    # Get the survey variable so the test result can be saved and reference the survey
    survey_id = request.args.get("survey")
    test_id = request.args.get("test_id")
    if form.validate_on_submit():
        if test_id:
            # Don't let a user overwrite a test that someone else saved
            existing = mongo.db.tests.find_one({"_id": ObjectId(test_id)})
            if existing is not None and existing["user"] != current_user._id:
                flash("You do not have access to that page", "danger")
                abort(403)
        # 'upsert' creates entry if it does not yet exist; with no test_id,
        # ObjectId(None) generates a fresh id so a new document is inserted
        mongo.db.tests.update_one({"_id": ObjectId(test_id)},
                                  {"$set": {"surveyId": survey_id,
                                            "user": current_user._id,
                                            "title": form.title.data,
                                            "test": test,
                                            "independentVariable": independent_variable,
                                            "dependentVariable": dependent_variable,
                                            "p": p_value}}, upsert=True)
        flash("Statistical test saved.", "success")
        return redirect(url_for('surveys.dashboard', title="Dashboard", survey_id=survey_id))
    title = request.args.get("title")
    if title:
        # i.e. if test already exists and user is clicking to view/edit it
        form.title.data = title
    else:
        # Set the default title. Users can change this.  Missing query
        # arguments are rendered as empty strings instead of raising.
        form.title.data = (independent_variable or "") + "/" + dependent_variable + ": " + (test or "")
    # Named result_data so the local does not shadow this view function
    result_data = {"test": test, "p": p_value, "alpha": alpha, "iv": independent_variable, "dv": dependent_variable}
    return render_template("analysis/result.html", result=result_data, form=form, survey_id=survey_id)
|
|
|
|
|
|
|
|
# DELETE A statistical test
@analysis.route("/analyse/<survey_id>/<test_id>/delete", methods=['POST'])
@login_required
def delete_test(survey_id, test_id):
    """Delete a saved statistical test owned by the current user.

    Args:
        survey_id: Survey whose dashboard the user is returned to.
        test_id: String form of the test's Mongo ``_id``.

    Aborts with 404 if the test does not exist and 403 if it belongs to
    another user.
    """
    test_obj = mongo.db.tests.find_one_or_404({"_id": ObjectId(test_id)})
    if test_obj["user"] != current_user._id:
        flash("You do not have access to that page", "danger")
        abort(403)
    # Delete by primary key rather than using the whole fetched document as
    # the filter: it is unambiguous and does not depend on every stored field
    # comparing equal.
    mongo.db.tests.delete_one({"_id": test_obj["_id"]})
    flash("Test deleted", "success")
    return redirect(url_for('surveys.dashboard', survey_id=survey_id))
|
|
|
|
|
|
# Give the user a quick overview of stats on the survey data
@analysis.route("/quickstats/<survey_id>", methods=['GET'])
@login_required
def quick_stats(survey_id):
    """Render a summary page for one of the current user's surveys.

    The page receives the number of rows and columns of the survey's data,
    the per-column information produced by ``parse_data`` and the survey's
    id and title.

    Aborts with 404 if the survey does not exist and 403 if it belongs to
    another user.
    """
    survey = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    owned_by_user = survey["user"] == current_user._id
    if not owned_by_user:
        flash("You do not have access to that page", "danger")
        abort(403)

    frame = read_file(survey["fileName"])
    context = {
        "rows": len(frame.index),
        "cols": len(frame.columns),
        "column_info": parse_data(frame),
        "survey_id": survey_id,
        "survey_title": survey["title"],
    }
    return render_template("analysis/quickstats.html", **context)
|
|
|
|
# Export the statistical tests saved for a survey as an Excel file
@analysis.route("/export_tests/<survey_id>", methods=['GET'])
@login_required
def export_tests(survey_id):
    """Download the survey's saved statistical tests as an .xlsx workbook.

    If the survey has no saved tests the user is sent back to the dashboard
    with a message instead.

    Args:
        survey_id: String form of the survey's Mongo ``_id``.

    Aborts with 404 if the survey does not exist and 403 if it belongs to
    another user.
    """
    file_obj = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    if file_obj["user"] != current_user._id:
        flash("You do not have access to that survey", "danger")
        abort(403)
    query = {"surveyId": survey_id}
    # Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # count_documents is the supported replacement.
    if mongo.db.tests.count_documents(query) == 0:
        flash("You do not yet have any statistical tests for this survey!", "danger")
        return redirect(url_for('surveys.dashboard', survey_id=survey_id))
    tests = mongo.db.tests.find(query)
    # Use a temp file so that it can be deleted after
    with tempfile.NamedTemporaryFile() as f:
        # Create a new excel workbook
        wb = Workbook(f.name)
        # Add the worksheet that will hold the tests
        ws = wb.add_worksheet("Statistical tests")
        tests_to_excel(ws, tests)
        # close() is what actually writes the workbook to disk
        wb.close()
        # NOTE(review): Flask 2.0 renamed attachment_filename to download_name
        # (old name removed in 2.2) - switch once the pinned Flask version is
        # confirmed.
        return send_file(f.name, attachment_filename=file_obj["title"] + ".xlsx", as_attachment=True)
|