import pandas as pd
from scipy import stats
import numpy as np
import re

def importCSV(filename, print_bool):
    """Load the programs CSV and give the hashed-username column a short name.

    Args:
        filename: Path or file-like object passed to ``pandas.read_csv``
            (comma separated).
        print_bool: When truthy, print the column keys, the first five rows
            and the number of rows that were read.

    Returns:
        The loaded DataFrame in which a ``hex(sha1(user.username))`` column,
        if present, has been renamed to ``user``.
    """
    column_aliases = {"hex(sha1(user.username))": "user"}
    programs = pd.read_csv(filename, sep=',').rename(columns=column_aliases)

    if not print_bool:
        return programs

    print('keys', programs.keys())
    print('first five rows\n', programs.head(5))
    print('Amount of programs', len(programs))
    return programs

def dropEmptyPrograms(data, print_bool):
    """Remove the rows whose ``code`` value is not a string.

    Rows are selected with a boolean mask rather than by treating positions
    as index labels, so the function also works when ``data`` does not carry
    a default ``0..n-1`` index (for example after an earlier filter).

    Args:
        data: DataFrame with a ``code`` column. It is not modified.
        print_bool: When truthy, print the number of rows before and after
            the filter and the number of rows that were dropped.

    Returns:
        A new DataFrame containing only the rows whose ``code`` is a ``str``.
        The index is renumbered from 0 and, because ``reset_index()`` is
        called without ``drop=True``, the previous index is kept in an
        ``index`` column.
    """
    has_code = data['code'].map(lambda code: isinstance(code, str)).astype(bool)

    if print_bool:
        print('original amount of programs', len(data))
        print(int((~has_code).sum()), "programs are empty")

    data = data.loc[has_code].reset_index()

    if print_bool:
        print('remaining amount of programs', len(data))

    return data

def unique_user_gender(data, print_bool):
    """Return the distinct (user, gender) combinations found in ``data``.

    Args:
        data: DataFrame with ``user`` and ``gender`` columns.
        print_bool: When truthy, print the number of distinct combinations
            and how many of them fall under each gender.

    Returns:
        A DataFrame with only the ``user`` and ``gender`` columns and one
        row per distinct combination (original index labels are kept).
    """
    pairs = data.loc[:, ['user', 'gender']]
    user_gender = pairs.drop_duplicates()

    if print_bool:
        print('Amount of users', len(user_gender))
        per_gender = user_gender.groupby(['gender']).size().reset_index()
        print('Amount of users per gender\n', per_gender.rename(columns={0: 'count'}))

    return user_gender

def group_user_gender_adventure(data, print_bool):
    """Count the programs per (user, gender, adventure_name) combination.

    Args:
        data: DataFrame with ``user``, ``gender`` and ``adventure_name``
            columns, one row per program.
        print_bool: When truthy, print the number of rows per gender.

    Returns:
        A DataFrame with the three grouping columns plus ``count_programs``,
        the number of rows in each group.
    """
    grouping_keys = ['user', 'gender', 'adventure_name']
    group_sizes = data.groupby(grouping_keys).size()
    programs = group_sizes.reset_index().rename(columns={0: 'count_programs'})

    if print_bool:
        per_gender = data.groupby(['gender']).size().reset_index()
        print('Amount of programs per gender\n', per_gender.rename(columns={0: 'count'}))

    return programs

def add_keywords_as_column(data):
    """Add one NaN-filled column per keyword, plus two summary columns.

    The columns are created (or overwritten) in place on ``data``.

    Args:
        data: DataFrame to extend.

    Returns:
        The same ``data`` object, now carrying the keyword columns followed
        by ``unique_keywords`` and ``total_keywords``, all set to NaN.
    """
    new_columns = (
        'print',
        'ask',
        'forward',
        'turn',
        'echo',
        'is',
        'sleep',
        'at random',
        'add to',
        'remove from',
        'if',
        'else',
        'in',
        'unique_keywords',
        'total_keywords',
    )
    for column_name in new_columns:
        data[column_name] = np.nan

    return data

def users_worked_on(user_gender, data, value, print_bool):
    """Record, per user, the work done on the story, game and visual adventures.

    Adventures are recognised through ``data['adventure_name']``: ``'story'``
    fills the ``story`` column, ``'rock'`` the ``game`` column and
    ``'turtle'`` the ``visual`` column.

    The columns are computed for every row of ``user_gender`` at once. A
    user that occurs on several rows gets the value on each of those rows,
    and in count mode the stored value is always a single number (the sum of
    that user's ``count_programs`` for the adventure) rather than a Series.

    Args:
        user_gender: DataFrame with at least ``user`` and ``gender`` columns.
            It is modified in place: the ``story``, ``game`` and ``visual``
            columns are added or overwritten.
        data: DataFrame with ``user`` and ``adventure_name`` columns. When
            ``value`` is not ``'binary'`` it must also have a
            ``count_programs`` column.
        value: ``'binary'`` stores 1.0 / 0.0 for worked / did not work on the
            adventure; any other value stores the number of programs
            (0.0 when the user has none).
        print_bool: When truthy, print the number of users per gender for
            each value of the three columns.

    Returns:
        The same ``user_gender`` object with the three float columns filled.
    """
    # Result column -> adventure_name value that feeds it.
    adventure_by_column = {'story': 'story', 'game': 'rock', 'visual': 'turtle'}

    users_adventures = user_gender

    for column, adventure in adventure_by_column.items():
        programs = data.loc[data['adventure_name'] == adventure]

        if value == 'binary':
            worked = users_adventures['user'].isin(programs['user'])
            users_adventures[column] = worked.astype(float)
        else:
            # One number per user, so the mapped value is always a scalar.
            per_user = programs.groupby('user')['count_programs'].sum()
            counts = users_adventures['user'].map(per_user).fillna(0)
            users_adventures[column] = counts.astype(float)

    if print_bool:
        print('Users who work on story')
        print(users_adventures.groupby(['gender', 'story']).size().reset_index().rename(columns={0:'count'}))

        print('Users who work on game')
        print(users_adventures.groupby(['gender', 'game']).size().reset_index().rename(columns={0:'count'}))

        print('Users who work on visual')
        print(users_adventures.groupby(['gender', 'visual']).size().reset_index().rename(columns={0:'count'}))

    return users_adventures
   
def calculateLOC(data):
    """Add a ``LOC`` column holding the number of lines of each program.

    Rows are processed by position, so the function works for any index
    and never appends rows when the index is not ``0..n-1``.

    Args:
        data: DataFrame with a ``code`` column. It is modified in place.

    Returns:
        The same ``data`` object with a float ``LOC`` column:
        ``len(code.splitlines())`` for string values and NaN otherwise. For
        every non-string value a message with the row's index label is
        printed.
    """
    dataLOC = data

    line_counts = []
    for label, code in zip(dataLOC.index, dataLOC['code']):
        if isinstance(code, str):
            line_counts.append(len(code.splitlines()))
        else:
            print('EMPTY PROGRAM AT', label)
            line_counts.append(np.nan)

    # A plain array is assigned positionally, independent of the index labels.
    dataLOC['LOC'] = np.array(line_counts, dtype=float)

    return dataLOC

def calculate_keywords_count(data): 
    """Count keyword occurrences per program and store them in the keyword columns.

    For every row the ``code`` string is split into lines, and each line is
    matched against the English keyword spellings and what appear to be their
    Dutch, Spanish and Chinese translations. Which keywords are considered
    depends on the row's ``level`` value. The checks run in a fixed order and
    most of them ``continue`` to the next line after the first match, so a
    line normally contributes to one keyword only. The exceptions are
    ``at random``, which is checked before everything else without a
    ``continue``, and ``if``/``else`` lines, whose remainder is scanned for
    further keywords.

    NOTE(review): rows are addressed as ``data['code'][i]`` / ``data.at[i, ...]``
    with ``i`` from ``range(len(data))``, so this presumably expects the
    default 0..n-1 index — confirm against the callers.
    NOTE(review): ``splitlines()`` is called on every ``code`` value without a
    type check, so non-string values are presumably removed beforehand.

    Args:
        data: DataFrame with ``code`` and ``level`` columns. It is modified
            in place.

    Returns:
        The same ``data`` object. The per-keyword columns hold the counts,
        ``total_keywords`` their sum per row and ``unique_keywords`` the
        number of keyword columns with a non-zero count per row.
    """
    for i in range(len(data)):
        code = data['code'][i].splitlines()    

        # Start every counter of this row at zero.
        data.at[i, 'print'] = 0
        data.at[i, 'ask'] = 0
        data.at[i, 'forward'] = 0
        data.at[i, 'turn'] = 0
        data.at[i, 'echo'] = 0
        data.at[i, 'is'] = 0
        data.at[i, 'sleep'] = 0
        data.at[i, 'at random'] = 0
        data.at[i, 'add to'] =  0
        data.at[i, 'remove from'] = 0
        data.at[i, 'if'] = 0
        data.at[i, 'else'] = 0
        data.at[i, 'in'] = 0

        for line in code:
            # "at random" can be combined with other keywords (levels above 2), therefore
            # count it here but keep evaluating the rest of the line (no continue).
            if data['level'][i] > 2 and  (' at random' in line or ' op willekeurig' in line or ' en aleatorio' in line or ' 在 随机' in line) :
                data.at[i, 'at random'] = data.at[i, 'at random'] + 1

            # Checks on the beginning of the line: once one matches, move on to the next line.
            if line.startswith('print') or line.startswith('imprimir') or line.startswith('打印'):
                data.at[i, 'print'] = data.at[i, 'print'] + 1
                continue
            # A line-initial "ask" is only counted at level 1; above level 1 the
            # " is ask" form is counted further down.
            if ((line.startswith('ask') or line.startswith('vraag') or line.startswith('preguntar') or line.startswith('提问')) and data['level'][i] == 1): 
                data.at[i, 'ask'] = data.at[i, 'ask'] + 1
                continue
            if line.startswith('forward') or line.startswith('vooruit') or line.startswith('adelante') or line.startswith('向前'):
                data.at[i, 'forward'] = data.at[i, 'forward'] + 1
                continue
            if line.startswith('turn') or line.startswith('draai') or line.startswith('girar') or line.startswith('返回'):
                data.at[i, 'turn'] = data.at[i, 'turn'] + 1
                continue
            # "echo" is only counted at level 1.
            if (line.startswith('echo') or line.startswith('eco') or line.startswith('回声')) and data['level'][i] == 1:
                data.at[i, 'echo'] = data.at[i, 'echo'] + 1
                continue
            # "sleep" is only counted above level 1.
            if data['level'][i] > 1 and (line.startswith('sleep') or line.startswith('slaap') or line.startswith('dormir') or line.startswith('睡眠')):
                data.at[i, 'sleep'] = data.at[i, 'sleep'] + 1
                continue

            # "add to" and "remove from": the line starts with add/remove and the
            # to/from part appears anywhere in the line (plain substring test).
            if data['level'][i] > 2 and (line.startswith('add') or line.startswith('voeg') or line.startswith('añadir') or line.startswith('加')) and (" to" in line or " toe aan" in line or " a" in line or " 到" in line):
                data.at[i, 'add to'] = data.at[i, 'add to'] + 1
                continue

            if data['level'][i] > 2 and (line.startswith('remove') or line.startswith('verwijder') or line.startswith('borrar') or line.startswith('移除')) and (" from" in line or " uit" in line or " de" in line or " 从" in line):
                data.at[i, 'remove from'] = data.at[i, 'remove from'] + 1
                continue

            # Situations where keywords can appear inside the line and not only at its start.
            # IF / ELSE: the if or else is at the beginning of the line, but it can be
            # followed by one or multiple other keywords.
            if data['level'][i] > 4 and (line.startswith('if') or line.startswith('als') or line.startswith('si') or line.startswith('如果')):
                data.at[i, 'if'] = data.at[i, 'if'] + 1
                # Strip quoted strings so that only keywords (and variables) remain.
                # The patterns need at least one character between the quotes.
                # NOTE(review): the original comment said quotes are used "in level 4 and up",
                # while this branch only runs for level > 4 — confirm the intended level.
                line = re.sub(r'[\'][^\']+[\']', " ", line)
                line = re.sub(r'[\"][^\"]+[\"]', " ", line)
                
                # Substring counts over the stripped line; the "is" count excludes the
                # " is ask" forms, which are counted as "ask" instead.
                # The doubled "+ +" in the 'turn' line is a binary plus followed by a
                # unary plus; it does not change the sum.
                data.at[i, 'print'] = data.at[i, 'print'] + line.count('print') + line.count('imprimir') + line.count('打印') 
                data.at[i, 'ask'] = data.at[i, 'ask'] + line.count(' is ask') + line.count(' is vraag') + line.count(' es preguntar') + line.count(' 是 提问') 
                data.at[i, 'forward'] = data.at[i, 'forward'] + line.count('forward') + line.count('vooruit') + line.count('adelante') + line.count('向前') 
                data.at[i, 'turn'] = data.at[i, 'turn'] + line.count('turn') + line.count('draai') + + line.count('girar') + line.count('返回') 
                data.at[i, 'is'] = data.at[i, 'is'] + line.count(' is') + line.count(' es') + line.count(' 是') - (line.count(' is ask') + line.count(' is vraag') + line.count(' es preguntar') + line.count(' 是 提问') )
                data.at[i, 'sleep'] = data.at[i, 'sleep'] + line.count('sleep') + line.count('slaap') + line.count('dormir') + line.count('睡眠') 
                data.at[i, 'else'] = data.at[i, 'else'] + line.count('else') + line.count('anders') + line.count('sino') + line.count('否则') 
                data.at[i, 'in'] = data.at[i, 'in'] + line.count(' in') + line.count(' en') + line.count(' 在里面') 

                continue

            if data['level'][i] > 4 and (line.startswith('else') or line.startswith('anders') or line.startswith('sino') or line.startswith('否则')):
                data.at[i, 'else'] = data.at[i, 'else'] + 1
                # Same string stripping and substring counting as for "if" lines, except
                # that further "else" occurrences in the line are not counted here.
                # The doubled "+ +" in the 'turn' and 'sleep' lines is a binary plus
                # followed by a unary plus; it does not change the sum.
                line = re.sub(r'[\'][^\']+[\']', " ", line)
                line = re.sub(r'[\"][^\"]+[\"]', " ", line)

                data.at[i, 'print'] = data.at[i, 'print'] + line.count('print') + line.count('imprimir') + line.count('打印') 
                data.at[i, 'ask'] = data.at[i, 'ask'] + line.count(' is ask') + line.count(' is vraag') + line.count(' es preguntar') + line.count(' 是 提问') 
                data.at[i, 'forward'] = data.at[i, 'forward'] + line.count('forward') + line.count('vooruit') + line.count('adelante') + line.count('向前') 
                data.at[i, 'turn'] = data.at[i, 'turn'] + line.count('turn') + line.count('draai') + + line.count('girar') + line.count('返回') 
                data.at[i, 'is'] = data.at[i, 'is'] + line.count(' is') + line.count(' es') + line.count(' 是') - (line.count(' is ask') + line.count(' is vraag') + line.count(' es preguntar') + line.count(' 是 提问') )
                data.at[i, 'sleep'] = data.at[i, 'sleep'] + line.count('sleep') + + line.count('slaap') + line.count('dormir') + line.count('睡眠') 
                data.at[i, 'in'] = data.at[i, 'in'] + line.count(' in') + line.count(' en') + line.count(' 在里面') 

                continue

            # "is": assignment of a variable/list. Checked after "if" so that the
            # comparison use of "is" has already been covered there.
            if data['level'][i] > 1 and (" is ask" in line or " is vraag" in line or " es preguntar" in line or  " 是 提问" in line): 
                data.at[i, 'ask'] = data.at[i, 'ask'] + 1
                continue

            if data['level'][i] > 1 and (" is" in line or " es" in line or " 是" in line):
                data.at[i, 'is'] = data.at[i, 'is'] + 1
                continue

            if data['level'][i] > 4 and (" in" in line or " en" in line or " 在里面" in line):
                data.at[i, 'in'] = data.at[i, 'in'] + 1
                continue

        # Sum of all keyword counters of this row.
        data.at[i, 'total_keywords'] = data.at[i, 'print'] + data.at[i, 'ask'] + data.at[i, 'forward'] + data.at[i, 'turn'] + data.at[i, 'echo'] + data.at[i, 'is'] + data.at[i, 'sleep'] + data.at[i, 'at random'] + data.at[i, 'add to'] + data.at[i, 'remove from'] + data.at[i, 'if'] + data.at[i, 'else'] + data.at[i, 'in']
    
    # Number of keyword columns with a non-zero count, computed for all rows at once.
    data['unique_keywords'] = np.count_nonzero(data[['print', 'ask', 'forward', 'turn', 'echo', 'is', 'sleep', 'at random', 'add to','remove from','if', 'else','in']], axis=1)
    return data

 
def column_mean_std(data, column):
    """Print mean and standard deviation of ``column``, overall and per gender.

    The values are rounded to one decimal. The gender groups are selected
    through the ``gender`` column values ``'m'`` (printed as BOYS), ``'f'``
    (GIRLS) and ``'o'`` (OTHERS).

    Args:
        data: DataFrame with a ``gender`` column and the requested column.
        column: Name of the column to summarise.
    """
    def _report(values):
        # Prints the two summary lines for one selection of values.
        print('mean', round(values.mean(), 1))
        print('std', round(values.std(), 1))

    print('___', column, '___')
    _report(data[column])

    for heading, gender_code in (('BOYS', 'm'), ('GIRLS', 'f'), ('OTHERS', 'o')):
        print(heading)
        _report(data.loc[data['gender'] == gender_code][column])
    
def chi_worked_on_between(boys, girls, others):
    """Print chi-square results and Cramér's V for the gender comparisons.

    Each pairwise comparison, followed by the three-way comparison, is
    printed as the ``scipy.stats.chi2_contingency`` result and then the
    value returned by ``cramer_v`` for the same table.

    Args:
        boys: Row of observed frequencies for the boys.
        girls: Row of observed frequencies for the girls.
        others: Row of observed frequencies for the others.
    """
    comparisons = (
        ('boys vs girls\n', [boys, girls]),
        ('boys vs others\n', [boys, others]),
        ('girls vs others\n', [girls, others]),
        ('boys vs girls vs others\n', [boys, girls, others]),
    )
    for heading, table in comparisons:
        print(heading, stats.chi2_contingency(table))
        print(cramer_v(table))

def cramer_v(data):
    """Return Cramér's V for a contingency table.

    The chi-square statistic is computed without continuity correction and
    divided by the total count times (smallest table dimension - 1); the
    square root of that ratio is returned.

    Args:
        data: Array-like contingency table of observed frequencies.

    Returns:
        The effect size as a NumPy float.
    """
    table = np.array(data)
    chi_squared = stats.chi2_contingency(table, correction=False)[0]
    total_count = np.sum(table)
    smallest_dimension = min(table.shape) - 1
    return np.sqrt(chi_squared / (total_count * smallest_dimension))

def chi_worked_on_within(story, game, visual):
    """Print the chi-square result and Cramér's V across the three adventures.

    Args:
        story: Row of observed frequencies for the story adventure.
        game: Row of observed frequencies for the game adventure.
        visual: Row of observed frequencies for the visual adventure.
    """
    table = [story, game, visual]
    print('story vs game vs visual\n', stats.chi2_contingency(table))
    print(cramer_v(table))

def ttest_within_gender_adv(data, gender):
    """Print t-tests and Cohen's d between the adventure columns for one gender.

    For every comparison the t-test and the effect size receive the samples
    in the order named in the printed label ("story vs game" compares story
    with game), so the sign of the t statistic and the sign of Cohen's d
    always point in the same direction.

    Args:
        data: DataFrame with a ``gender`` column and the numeric ``story``,
            ``game`` and ``visual`` columns.
        gender: Value of the ``gender`` column to select.
    """
    data_gender = data.loc[data['gender'] == gender]
    comparisons = (('story', 'game'), ('story', 'visual'), ('game', 'visual'))

    print('___', gender, '___')
    for first, second in comparisons:
        print(first, 'vs', second)
        print(stats.ttest_ind(data_gender[first], data_gender[second]))
        print(cohend(data_gender[first], data_gender[second]))

def ttest_within_gender(data, gender, column):
    """Print independent t-tests of ``column`` between adventures for one gender.

    The adventures are selected through the ``adventure_name`` values
    ``'story'``, ``'rock'`` (printed as game) and ``'turtle'`` (printed as
    visual).

    Args:
        data: DataFrame with ``adventure_name`` and ``gender`` columns and
            the requested column.
        gender: Value of the ``gender`` column to select.
        column: Name of the column whose values are compared.
    """
    label_to_adventure = (('story', 'story'), ('game', 'rock'), ('visual', 'turtle'))
    subsets = {
        label: data.loc[(data['adventure_name'] == adventure) & (data['gender'] == gender)]
        for label, adventure in label_to_adventure
    }

    print('___', gender, '___')
    for left, right in (('story', 'game'), ('story', 'visual'), ('game', 'visual')):
        print(left + " vs " + right)
        print(stats.ttest_ind(subsets[left][column], subsets[right][column]))

def ttest_between_gender_all(data, column):
    """Print t-tests and Cohen's d of ``column`` between the gender groups.

    The groups are selected through the ``gender`` values ``'m'`` (boys),
    ``'f'`` (girls) and ``'o'`` (others). For every comparison the t-test and
    the effect size receive the samples in the order named in the printed
    label, so the sign of the t statistic and the sign of Cohen's d always
    point in the same direction.

    Args:
        data: DataFrame with a ``gender`` column and the requested column.
        column: Name of the column whose values are compared.
    """
    groups = {
        'boys': data.loc[data['gender'] == 'm'],
        'girls': data.loc[data['gender'] == 'f'],
        'others': data.loc[data['gender'] == 'o'],
    }
    comparisons = (('boys', 'girls'), ('boys', 'others'), ('others', 'girls'))

    print('___', column, '___')
    for first, second in comparisons:
        print(first, 'vs', second)
        print(stats.ttest_ind(groups[first][column], groups[second][column]))
        print(cohend(groups[first][column], groups[second][column]))

def ttest_between_gender(data, adventure, column):
    """Print t-tests and Cohen's d of ``column`` between genders for one adventure.

    The groups are selected through the ``gender`` values ``'m'`` (boys),
    ``'f'`` (girls) and ``'o'`` (others), restricted to rows whose
    ``adventure_name`` equals ``adventure``. For every comparison the t-test
    and the effect size receive the samples in the order named in the printed
    label, so the sign of the t statistic and the sign of Cohen's d always
    point in the same direction.

    Args:
        data: DataFrame with ``adventure_name`` and ``gender`` columns and
            the requested column.
        adventure: Value of ``adventure_name`` to select.
        column: Name of the column whose values are compared.
    """
    in_adventure = data['adventure_name'] == adventure
    groups = {
        'boys': data.loc[in_adventure & (data['gender'] == 'm')],
        'girls': data.loc[in_adventure & (data['gender'] == 'f')],
        'others': data.loc[in_adventure & (data['gender'] == 'o')],
    }
    comparisons = (('boys', 'girls'), ('boys', 'others'), ('others', 'girls'))

    print('___', adventure, '___', column, '___')
    for first, second in comparisons:
        print(first, 'vs', second)
        print(stats.ttest_ind(groups[first][column], groups[second][column]))
        print(cohend(groups[first][column], groups[second][column]))

def cohend(d1, d2):
    """Return Cohen's d, the standardised difference between two sample means.

    The pooled standard deviation weights each group's variance by ``n - 1``,
    so the variances are computed as sample variances (``ddof=1``); NumPy's
    default ``ddof=0`` would give the population variance instead.

    Args:
        d1: First sample (sequence or pandas Series of numbers).
        d2: Second sample.

    Returns:
        ``(mean(d1) - mean(d2)) / pooled_standard_deviation``.
    """
    # sample sizes
    n1, n2 = len(d1), len(d2)
    # unbiased sample variances, matching the (n - 1) weights used below
    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    # pooled standard deviation
    s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    # sample means
    u1, u2 = np.mean(d1), np.mean(d2)
    # effect size
    return (u1 - u2) / s
