import os
import secrets

import numpy as np
import pandas as pd
from flask import Flask, current_app


# Saves file after import. All files are saved as CSV for easier handling
def save_file(form_file):
    """Clean an uploaded spreadsheet and store it as a CSV in the uploads folder.

    Args:
        form_file: Uploaded file object exposing a ``filename`` attribute and
            readable by ``pandas.read_csv`` / ``pandas.read_excel``.

    Returns:
        The randomly generated file name (not the full path) of the saved CSV.
    """
    # Only the extension matters; the base name is discarded ('_' per PEP 8).
    # A missing filename is treated as "no extension".
    _, f_ext = os.path.splitext(form_file.filename or "")

    # Compare case-insensitively so "DATA.CSV" is not mistaken for Excel.
    # Anything that is not CSV is read as Excel and converted on save.
    if f_ext.lower() == ".csv":
        df = pd.read_csv(form_file, index_col=None)
    else:
        df = pd.read_excel(form_file, index_col=None)

    # Remove empty rows/columns, then trim white space from strings
    df = trim_strings(remove_nan(df))

    # Generate a random hex filename and create the path
    file_name = generate_filepath()
    upload_dir = os.path.join(current_app.root_path, "uploads")
    # Make sure the target folder exists so the first upload does not fail
    os.makedirs(upload_dir, exist_ok=True)

    # Save as CSV
    df.to_csv(os.path.join(upload_dir, file_name), encoding="utf-8", index=False)
    return file_name


# Uses python secrets to generate a random hex token for the file name
def generate_filepath():
    """Return a random file name: 16 hex characters followed by ``.csv``."""
    return secrets.token_hex(8) + ".csv"


def trim_strings(df):
    """Return a copy of ``df`` with leading/trailing white space stripped from
    every string cell; non-string cells are left untouched."""

    def _strip_if_str(value):
        return value.strip() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check on the class so a column called "map" cannot
    # shadow the method on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        return df.map(_strip_if_str)
    return df.applymap(_strip_if_str)


# Given a name, locates file and returns the dataframe.
def read_file(name):
    """Load the stored CSV called ``name`` from the uploads folder as a DataFrame."""
    return pd.read_csv(os.path.join(current_app.root_path, "uploads", name))


def delete_file(name):
    """Delete the stored file called ``name`` from the uploads folder.

    Raises:
        OSError: Propagated from ``os.remove`` (e.g. the file does not exist).
    """
    file_path = os.path.join(current_app.root_path, "uploads", name)
    os.remove(file_path)


# A function that removes all leading empty rows/columns
def remove_nan(df):
    """Drop rows and columns that are entirely empty.

    If the first row of ``df`` is completely empty, the first remaining row
    after the clean-up is promoted to be the column header.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A new DataFrame with a fresh 0-based index. An empty input, or an
        input containing nothing but missing values, yields an empty frame
        instead of raising ``IndexError``.
    """
    # Nothing to inspect or clean; also avoids df.iloc[0] failing below.
    if df.empty:
        return df.copy()

    # Remember whether the first row was empty (subsequently requiring the
    # header to be reset after empty rows are removed).
    first_row_blank = df.iloc[0].isnull().all()

    # Drop empty columns first, then empty rows, and renumber the rows.
    cleaned = (
        df.dropna(how="all", axis=1)
        .dropna(how="all", axis=0)
        .reset_index(drop=True)
    )

    # No header fix-up needed, or no row left to promote to a header.
    if not first_row_blank or cleaned.empty:
        return cleaned

    # The first remaining row becomes the header; the rest is the data.
    header_row = cleaned.iloc[0]
    cleaned = cleaned.iloc[1:].reset_index(drop=True)
    # A plain list keeps the old row label from becoming the columns' name.
    cleaned.columns = header_row.tolist()
    return cleaned


# This function loops through each column and collects information on the type of data (numerical vs categorical)
# and the number of unique entries in that column. The type of graph that can be used will depend on the type of data.
# Will also be useful for suggesting to the user about grouping if there are lots of unique entries.
# e.g. if there are 100 different 'ages', can suggest grouping in 10 year batches.
def parse_data(df):
    """Summarise every column of ``df`` for the graphing front end.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A list with one dict per column, in column order. Every dict has
        ``"title"``, ``"num_unique"`` and ``"data_type"``, where the type is
        one of ``"true/false"``, ``"numerical"``, ``"date"``, ``"time"``,
        ``"date/time"`` or ``"categorical"``. Boolean and categorical columns
        also carry ``"quantities"`` (value -> count). Numerical columns carry
        ``"standard_deviation"``, ``"average"``, ``"max"``, ``"min"`` and
        ``"sum"``.
    """
    numerics = (np.int64, np.int32, np.int16, np.int8,
                np.float64, np.float32, np.float16,
                np.uint64, np.uint32, np.uint16, np.uint8)
    column_info = []

    # DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
    for column_title, column_data in df.items():
        summary = {
            "title": column_title,
            "num_unique": column_data.nunique(),
        }

        # Builtin bool replaces the np.bool alias removed in NumPy 1.24
        if column_data.dtype == bool:
            summary["data_type"] = "true/false"
            summary["quantities"] = column_data.value_counts().to_dict()
        elif column_data.dtype in numerics:
            summary["data_type"] = "numerical"
            # Rounded to 4 significant figures so that it can fit on the page
            summary["standard_deviation"] = float('%.4g' % column_data.std())
            summary["average"] = float('%.4g' % column_data.mean())
            summary["max"] = column_data.max()
            summary["min"] = column_data.min()
            summary["sum"] = column_data.sum()
        else:
            # Try to parse it as a date/time. If that fails, it must be an
            # object column (categorical data).
            datetime_kind = _datetime_kind(column_data)
            if datetime_kind is None:
                summary["data_type"] = "categorical"
                summary["quantities"] = column_data.value_counts().to_dict()
            else:
                summary["data_type"] = datetime_kind

        column_info.append(summary)

    return column_info


def _datetime_kind(column_data):
    """Return "date", "time" or "date/time" if the column parses as datetimes,
    or None when it does not."""
    try:
        parsed = pd.to_datetime(column_data, dayfirst=True)
    except (ValueError, TypeError, OverflowError):
        # Unparseable text raises ValueError; values such as booleans in a
        # mixed object column raise TypeError.
        return None

    # Guard the .dt accessor: only a real datetime dtype supports it.
    if not pd.api.types.is_datetime64_any_dtype(parsed):
        return None

    # Every value sits exactly on midnight -> the column holds dates only.
    if (parsed.dt.floor('d') == parsed).all():
        return "date"
    # Every value falls on today's date -> treated as a time-only column
    # (presumably the parser fills in the current date when none is given).
    if (parsed.dt.date == pd.Timestamp('now').date()).all():
        return "time"
    return "date/time"