"""Views of the ``analysis`` blueprint.

Lets a logged-in user run statistical tests on an uploaded survey, view and
save the result, delete saved tests, see quick statistics for a survey and
export the saved tests to an Excel workbook.
"""
import tempfile

import pandas as pd
from bson.objectid import ObjectId
from flask import Flask, render_template, url_for, request, Blueprint, flash, redirect, abort, send_file
from flask_login import login_required, current_user
from pandas.api.types import is_string_dtype
from pingouin import kruskal, mwu
from scipy.stats import chi2_contingency, chisquare
from xlsxwriter import Workbook

from surveyapp import mongo
from surveyapp.analysis.forms import StatisticalTestForm, ChiGoodnessEntryForm, ChiGoodnessForm
from surveyapp.analysis.utils import tests_to_excel
from surveyapp.surveys.forms import EditForm
from surveyapp.surveys.utils import parse_data, read_file

analysis = Blueprint("analysis", __name__)


def _get_owned_survey(survey_id, denied_message="You do not have access to that page"):
    """Return the survey document with id ``survey_id`` if the current user owns it.

    Aborts with 404 when no such survey exists, and with 403 (after flashing
    ``denied_message``) when the survey's ``user`` field is not the current
    user's id.
    """
    survey = mongo.db.surveys.find_one_or_404({"_id": ObjectId(survey_id)})
    if survey["user"] != current_user._id:
        flash(denied_message, "danger")
        abort(403)
    return survey


# Analyse data sets
# In this function, after failing validation I have chosen to render the template fresh
# rather than redirecting the user back to this route. This is so that the form fields
# remain filled in and the user doesn't have to re-enter their choices.
@analysis.route("/analyse/<survey_id>", methods=['GET', 'POST'])
@login_required
def analyse(survey_id):
    """Show the statistical-test form for a survey and run the chosen test.

    On a valid submission the p-value is computed and the user is redirected
    to ``analysis.result`` (or to ``analysis.chi_goodness`` for the goodness
    of fit test, which needs an extra form). On any validation problem the
    form is re-rendered with a flashed message.
    """
    form = StatisticalTestForm()
    survey = _get_owned_survey(survey_id)
    df = read_file(survey["fileName"])

    # Populate the select options in the form with all the variables
    for variable in df.columns:
        form.independent_variable.choices.append((variable, variable))
        form.dependent_variable.choices.append((variable, variable))

    def reject(message):
        """Flash ``message`` and re-render the form with the user's choices intact."""
        flash(message, "danger")
        return render_template("analysis/analysedata.html", form=form)

    if not form.validate_on_submit():
        return render_template("analysis/analysedata.html", form=form)

    independent_variable = form.independent_variable.data
    dependent_variable = form.dependent_variable.data
    # Ensure the user hasn't selected the same variable for both
    if independent_variable == dependent_variable:
        return reject("You can't select the same variable for both.")

    test = form.test.data
    # If the user selects Chi-Square goodness fit then they are redirected to a separate URL
    if test == "Chi-Square goodness of fit":
        return redirect(url_for('analysis.chi_goodness',
                                variable=independent_variable,
                                survey_id=survey_id))

    # The other tests all require a dependent variable
    if not dependent_variable:
        return reject("You must select a dependent variable for this test.")

    if test == "Kruskall Wallis Test":
        if is_string_dtype(df[dependent_variable]):
            return reject("Dependent Variable '" + dependent_variable + "' is not numeric.")
        kruskal_result = kruskal(data=df, dv=dependent_variable, between=independent_variable)
        # Get the p-value (p-unc) from the kruskal test, limited to 4 decimal places.
        # Read positionally via .values so it does not depend on the result's row label.
        p_value = "%.4f" % kruskal_result["p-unc"].values[0]
    # AT THE MOMENT, THIS TEST IS 2 TAILED. MAY WANT TO ADD OPTIONS FOR 1 TAILED TESTS
    elif test == "Mann-Whitney U Test":
        if is_string_dtype(df[dependent_variable]):
            return reject("Dependent Variable '" + dependent_variable + "' is not numeric.")
        samples = [group[dependent_variable].values
                   for _, group in df.groupby(independent_variable)]
        if len(samples) != 2:
            return reject("Independent variable '" + independent_variable
                          + "' has too many groups, only 2 allowed for Mann-Whitney U Test.")
        mwu_result = mwu(samples[0], samples[1])
        p_value = "%.4f" % mwu_result['p-val'].values[0]
    elif test == "Chi-Square Test":
        contingency_table = pd.crosstab(df[independent_variable], df[dependent_variable])
        _, p_value, _, _ = chi2_contingency(contingency_table, correction=False)
    else:
        # Without this branch an unrecognised test name would leave p_value unbound.
        return reject("Unknown statistical test selected.")

    return redirect(url_for('analysis.result',
                            survey=survey_id,
                            test=test,
                            p_value=p_value,
                            independent_variable=independent_variable,
                            dependent_variable=dependent_variable))


# Chi goodness of fit - extra form for expected values
@analysis.route("/chi/<survey_id>/<variable>", methods=['GET', 'POST'])
@login_required
def chi_goodness(survey_id, variable):
    """Collect expected counts per group of ``variable`` and run a goodness of fit test.

    The form has one entry per unique value of ``variable``. If every expected
    count is left at 0, ``chisquare`` is called without expected frequencies.
    Redirects to ``analysis.result`` once the p-value is computed.
    """
    # Same ownership rule as the other survey views.
    survey = _get_owned_survey(survey_id)
    df = read_file(survey["fileName"])
    if variable not in df.columns:
        # ``variable`` comes from the URL; an unknown column would otherwise raise KeyError.
        abort(404)

    keys = list(df.groupby(variable).groups.keys())
    # Get the total count, so that we can check the expected distribution matches
    total_count = len(df.index)
    # Populate the form with the unique groups, initialising "expected" to 0
    key_list = [{"expected": 0, "key": key} for key in keys]
    form = ChiGoodnessForm(field=key_list)

    if form.validate_on_submit():
        actual_distribution = []
        expected_distribution = []
        for key in keys:
            # For each group, we get the count in the data and append it to our list
            actual_distribution.append(df[df[variable] == key].shape[0])
            # Now we populate the expected count from the matching form entry
            for entry in form.field.data:
                if key == entry['key']:
                    expected_distribution.append(entry['expected'])

        if sum(expected_distribution) == 0:
            _, p_value = chisquare(actual_distribution)
        else:
            _, p_value = chisquare(actual_distribution, expected_distribution)
        return redirect(url_for('analysis.result',
                                survey=survey_id,
                                test="Chi-Square goodness of fit",
                                p_value=p_value,
                                independent_variable=variable))

    return render_template("analysis/chisquare.html", form=form, keys=keys, total=total_count)


# Results from stats test
@analysis.route("/result", methods=['GET', 'POST'])
@login_required
def result():
    """Display a test result taken from the query string and optionally save it.

    Reads ``p_value``, ``test``, ``independent_variable``, ``dependent_variable``,
    ``survey``, ``test_id`` and ``title`` from the query string. On a valid form
    submission the test is upserted into ``mongo.db.tests`` and the user is
    redirected to the survey dashboard. Aborts with 400 when ``p_value`` is
    missing or is not a number.
    """
    form = EditForm()
    # Set a default alpha value 0.05 to compare the p value to
    alpha = 0.05
    # Cast string to float so it can be compared with the alpha value;
    # ``type=float`` yields None instead of raising when the value is absent/invalid.
    p_value = request.args.get("p_value", type=float)
    if p_value is None:
        abort(400)
    test = request.args.get("test")
    independent_variable = request.args.get("independent_variable")
    # Chi goodness does not have a dependent_variable
    dependent_variable = request.args.get("dependent_variable") or ""
    # Get the survey variable so the test result can be saved and reference the survey
    survey_id = request.args.get("survey")
    test_id = request.args.get("test_id")

    if form.validate_on_submit():
        # 'upsert' creates entry if it does not yet exist
        # NOTE(review): the filter matches on _id only, so ownership of an existing
        # test is not verified before it is overwritten - confirm this is intended.
        mongo.db.tests.update_one(
            {"_id": ObjectId(test_id)},
            {"$set": {"surveyId": survey_id,
                      "user": current_user._id,
                      "title": form.title.data,
                      "test": test,
                      "independentVariable": independent_variable,
                      "dependentVariable": dependent_variable,
                      "p": p_value}},
            upsert=True)
        flash("Statistical test saved.", "success")
        return redirect(url_for('surveys.dashboard', title="Dashboard", survey_id=survey_id))

    title = request.args.get("title")
    if title:
        # i.e. if test already exists and user is clicking to view/edit it
        form.title.data = title
    else:
        # Set the default title. Users can change this
        form.title.data = independent_variable + "/" + dependent_variable + ": " + test

    outcome = {"test": test,
               "p": p_value,
               "alpha": alpha,
               "iv": independent_variable,
               "dv": dependent_variable}
    return render_template("analysis/result.html", result=outcome, form=form, survey_id=survey_id)


# DELETE A statistical test
@analysis.route("/analyse/<survey_id>/<test_id>/delete", methods=['POST'])
@login_required
def delete_test(survey_id, test_id):
    """Delete a saved statistical test owned by the current user, then show the dashboard.

    Aborts with 404 if the test does not exist and 403 if it belongs to another user.
    """
    test_obj = mongo.db.tests.find_one_or_404({"_id": ObjectId(test_id)})
    if test_obj["user"] != current_user._id:
        flash("You do not have access to that page", "danger")
        abort(403)
    # Filter on the primary key only rather than on every field of the document.
    mongo.db.tests.delete_one({"_id": test_obj["_id"]})
    flash("Test deleted", "success")
    return redirect(url_for('surveys.dashboard', survey_id=survey_id))


# Give the user a quick overview of stats on the survey data
@analysis.route("/quickstats/<survey_id>", methods=['GET'])
@login_required
def quick_stats(survey_id):
    """Render row/column counts and per-column info (from ``parse_data``) for a survey."""
    survey = _get_owned_survey(survey_id)
    df = read_file(survey["fileName"])
    return render_template("analysis/quickstats.html",
                           rows=len(df.index),
                           cols=len(df.columns),
                           column_info=parse_data(df),
                           survey_id=survey_id,
                           survey_title=survey["title"])


# Export the statistical tests saved for a survey as an Excel file
@analysis.route("/export_tests/<survey_id>", methods=['GET'])
@login_required
def export_tests(survey_id):
    """Send the survey's saved statistical tests as an ``.xlsx`` attachment.

    If the survey has no saved tests, flashes a message and redirects to the
    dashboard instead.
    """
    survey = _get_owned_survey(survey_id, "You do not have access to that survey")
    query = {"surveyId": survey_id}
    # count_documents replaces Cursor.count(), which newer PyMongo versions no longer provide.
    if mongo.db.tests.count_documents(query) == 0:
        flash("You do not yet have any statistical tests for this survey!", "danger")
        return redirect(url_for('surveys.dashboard', survey_id=survey_id))
    tests = mongo.db.tests.find(query)

    # Use a temp file so that it can be deleted after
    with tempfile.NamedTemporaryFile() as f:
        # Create a new excel workbook
        wb = Workbook(f.name)
        # grab the active worksheet
        ws = wb.add_worksheet("Statistical tests")
        tests_to_excel(ws, tests)
        wb.close()
        # Return from inside the ``with`` so the file still exists when it is sent.
        return send_file(f.name, attachment_filename=survey["title"] + ".xlsx", as_attachment=True)