#! /usr/bin/env python3

import csv
import gambit
import gzip
import json
import os
import os.path
import re

from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

# Directory with the JSONs generated by perceval
directory = 'jsons/'
# Directory with the JSONs with GitHub repo info
info_directory = 'repos_info'

# Per-repo results: repo -> {source -> (number of items, author -> action count)}
resultsDict = {}

# Parallel alias lists fed to the gambit identity disambiguation step
alias_already = []   # (name, email) pairs already recorded
alias_name = []
alias_email = []

# Cached gambit output (CSV with alias_name / alias_email / author_id columns)
disambiguation_file = 'disambiguation.csv'

# Where the author field lives inside each kind of perceval item
infoDict = { 
    "commits": ("data", "Author"),   # Commits: item["data"]["Author"]
    "issues": ("user", "login"),     # Issues: item["user"]["login"]
    "pulls": ("user", "login")       # Pulls: item["user"]["login"]
}

#bots = ['readmecritic', 'dependabot[bot]', 'release bot', 'leshan-bot', 'snyk-bot', 'emqx-ci-robot', 'renovate[bot]', 'monty-bot', 'dependabot-preview[bot]', 'github actions', 'greenkeeper[bot]', 'weekly-digest[bot]', 'codecov[bot]', 'fossabot', 'jenkins', 'copilot4prs[bot]', 'github-code-scanning[bot]', 'sync-by-unito[bot]']

# Hand-curated list of known bot account names (lower-cased), used by is_bot()
bots = ['readmecritic', 'release bot', 'emqx-ci-robot', 'github actions', 'fossabot', 'jenkins', 'nrel-bot-2', 'nrel-bot-3', 'nrel-bot-2c', 'nrel-bot-2b', 'gh pages bot']

# E-mail domains (and suffixes such as '.user' / '.local') that do not
# identify a company affiliation
non_corporate = ['.user', 'gmail.com', 'googlemail.com', 'github.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'qq.com', 'foxmail.com', 'protonmail.com', 'free.fr', '.local', '163.com', 'gmx.net', '126.com', 'users.sourceforge.net', 'synk.io', 'web.de', 'live.com', 'gmx.de', 'localhost', 'sina.com', 'mail.ru', 'icloud.com', 'outlook.de', 'hotmail.it', 'localhost.localdomain', 'hotmail.de', 'yahoo.it', 'yahoo.fr', 'sina.cn']

# Perceval dump filenames look like "<repo>-YYYYMMDD-<source>.json.gz"
prog = re.compile(r"^(.*)-\d{8}-(.*).json.gz$")


def find_corporate(email_list):
    """
    Given a list of e-mails, returns the corporate ones, removing:
        - duplicates
        - elements that are not an e-mail address (incl. non-strings, e.g. NaN)
        - e-mails with a known non-corporate domain
    """
    rejected = set()
    for email in email_list:
        # Anything that is not a string containing '@' is not an e-mail.
        # (Fixed: non-string entries used to slip through into the result.)
        if not isinstance(email, str) or '@' not in email:
            rejected.add(email)
            continue
        for suffix in non_corporate:
            if not email.endswith(suffix):
                continue
            # Only reject on a domain boundary so that e.g. 'a@notgmail.com'
            # is not mistaken for 'gmail.com'. Suffixes starting with '.'
            # ('.user', '.local') already anchor on the dot.
            cut = len(email) - len(suffix)
            if suffix.startswith('.') or cut == 0 or email[cut - 1] in '@.':
                rejected.add(email)
                break
    return list(set(email_list) - rejected)
    
def domain_frequency(email_list):
    """
    Count how often each e-mail domain appears in a list of e-mails.

    Returns a dict-like mapping:
        key: domain (e.g., urjc.es)
        value: number of appearances
    Entries that do not split into exactly user@domain are skipped.
    """
    counts = defaultdict(int)
    for address in email_list:
        pieces = address.split('@')
        if len(pieces) != 2:
            continue
        counts[pieces[1]] += 1
    return counts
    
def is_bot(author_name):
    """
    Heuristically decide whether an author name belongs to a bot:
    it contains '[bot]' or '-bot', or it appears in the curated bots list.
    """
    return (
        '[bot]' in author_name
        or '-bot' in author_name
        or author_name in bots
    )

def to_csv(repo, commits, issues, pulls):
    """
    Print one repository's stats as a comma-separated line:
    the repo name followed by both fields of the commits, issues and
    pulls tuples, in that order.
    """
    fields = (repo, commits[0], commits[1], issues[0], issues[1], pulls[0], pulls[1])
    print(", ".join(str(field) for field in fields))
    
def author_split(author):
    """
    Given an author string "Name Surname <email@domain.org>",
    returns the name ("Name Surname") and the email ("email@domain.org").

    For a bare login "user" it synthesizes "user@github.user" as email.
    For a space-separated name with no e-mail ("john smith") it synthesizes
    "john_smith@another.user".
    """
    if '<' in author:
        author = author.replace('>', '')
        name, email = author.split('<')
        return name.strip(), email
    elif ' ' in author:
        # Fixed: the original nested replace() produced garbled addresses
        # like "johnjohn_smith@another.usersmith" instead of
        # "john_smith@another.user".
        return (author, author.replace(' ', '_') + '@another.user')
    else:
        return (author, author + '@github.user')
        
def get_author_id(name):
    """
    Given a name of a developer, returns its disambiguation id (author_id).
    Returns None (after logging the lookup failure) when the name is not
    present in the global disamb table.
    """
    candidates = disamb[disamb.alias_name == name]
    try:
        return candidates["author_id"].iloc[0]
    except IndexError:
        try:
            print("Error", "get_author_id", "Name", name, "---")
        except UnicodeEncodeError:
            print("UnicodeEncodeError")
        return None

def get_name(author_id):
    """
    Given a disambiguation id, returns the (first) alias name recorded for
    that developer, or None (after logging) when the id is unknown.
    """
    rows = disamb[disamb.author_id == author_id]
    try:
        return rows["alias_name"].iloc[0]
    except IndexError:
        print("Error", "get_name", "Author_id", author_id, "---")
        return None
        
def add_contribution(contributiondict):
    """
    Fold a per-author contribution dict (key: name/author_id, value: number
    of contributions) into the global authorContrib accumulator.
    Mutates authorContrib in place; returns None.
    """
    for author, amount in contributiondict.items():
        authorContrib[author] += amount
        
def add_edges(contributiondict):
    """
    Build collaboration edges for one repository: one (a, b) tuple with
    a < b for every unordered pair of contributors (dict keys).
    """
    return [
        (first, second)
        for first in contributiondict
        for second in contributiondict
        if first < second
    ]

def add_affiliation_edges(contributions, affiliations, companies, threshold=0):
    """
    contributions: author_id -> number of contributions
    affiliations:  author_id -> company domain (e.g., ibm.com)
    companies:     company domain -> company_id (e.g., 43)
    threshold: strict minimum of contributions for an author to count

    Returns edges (company_id1, company_id2) with id1 < id2 between the
    companies of qualifying affiliated contributors; duplicates are kept
    (one per pair of contributors).
    """
    firm_ids = [
        companies[affiliations[author]]
        for author, amount in contributions.items()
        if amount > threshold and author in affiliations
    ]
    return [(one, two) for one in firm_ids for two in firm_ids if one < two]


def get_domain(email):
    """
    Return the second-level domain + TLD of an e-mail address
    (e.g., "john.smith@two.foo.bar" -> "foo.bar").
    Non-string input yields the placeholder domain ".user".
    """
    if not isinstance(email, str):
        return ".user"
    _user, host = email.rsplit('@', 1)
    return '.'.join(host.split('.')[-2:])


def filename2repo(filename):
    """
    Given a filename, returns the (owner, reponame) tuple.

    TODO: not implemented (always returns None) and currently unused.
    Presumably meant to invert the "<owner>_<repo>-YYYYMMDD-<source>.json.gz"
    naming used by the dumps — confirm before implementing.
    """
    pass

def get_license(filename):
    """
    Load one repo-info JSON from info_directory and return its headline
    metadata as a tuple:
        (repo_name, license, stargazers, watchers, forks, created, updated)
    license is the SPDX id, or "N/A" when GitHub reports no license.
    """
    with open(info_directory + "/" + filename) as json_file:
        repodata = json.load(json_file)
    data = repodata['data']
    spdx = data['license']['spdx_id'] if data['license'] else "N/A"
    repo_name = repodata['origin'].replace("https://github.com/", "")
    return (repo_name, spdx, data['stargazers_count'], data['watchers_count'],
            data['forks'], data['created_at'], data['updated_at'])

# Cross-repo accumulators filled by the ingest loop below:
superAuthorList = []                  # every distinct (lower-cased) author name
superAuthorDict = defaultdict(list)   # author name -> repos contributed to
superAuthorDictM = defaultdict(list)  # author_id -> repos (filled after disambiguation)

# Ingest every perceval dump: count items per repo/source and actions per author.
for filename in os.listdir(directory):
    if not filename.endswith(".json.gz"):
        continue
    if filename == "already.json.gz":
        continue
    if filename.startswith("github-repos-"):
        continue

    # Dump names follow "<repo>-YYYYMMDD-<source>.json.gz" (see prog regex)
    repo, source = prog.match(filename).groups()

    number_of_items = 0
    authorDict = defaultdict(int)  # author name -> actions in this repo/source
    with gzip.open(directory + filename, 'rb') as f:
        print(filename)
        # The dump is a stream of concatenated pretty-printed JSON objects;
        # object boundaries are detected on the "}{" line (or the final "}")
        # and each object is parsed individually.
        jsonStr = ''
        for bline in f:
            line = bline.decode("utf-8") # Convert bytes into string
            if line == "}{\n" or line == "}":
                jsonStr += "}"
                item = json.loads(jsonStr)
                number_of_items += 1
                # Where the author lives in the item depends on the source (see infoDict)
                author = item[infoDict[source][0]][infoDict[source][1]]
                name, email = author_split(author.lower())
                # Record every new (name, email) alias for gambit disambiguation
                if (name, email) not in alias_already:
                    alias_name.append(name)
                    alias_email.append(email)
                    alias_already.append((name, email))

                if name not in superAuthorList:
                    superAuthorList.append(name)
                if repo not in superAuthorDict[name]:
                    superAuthorDict[name].append(repo)
                authorDict[name] += 1
                # Issue/PR comments count as actions of the commenter too
                if source in ("issues", "pulls"):
                    for comment in item["comments_data"]:
                        name = comment["user"]["login"]
                        name, email = author_split(name.lower())
                        if (name, email) not in alias_already:
                            alias_name.append(name)
                            alias_email.append(email)
                            alias_already.append((name, email))
                        authorDict[name] += 1
                jsonStr = "{"
            else:
                jsonStr += line
    if repo not in resultsDict:
        resultsDict[repo] = {}
    resultsDict[repo][source] = (number_of_items, authorDict)
#    print(repo, source, number_of_items, set(authorList), authorDict)

# Run the (slow) gambit disambiguation only once; its result is cached on
# disk in disambiguation_file and reloaded on subsequent runs.
if not os.path.isfile(disambiguation_file):
    aliases = pd.DataFrame({'alias_name': alias_name, 'alias_email': alias_email})
    disamb = gambit.disambiguate_aliases(aliases)
    with open(disambiguation_file, 'w') as output:
        output.write(disamb.to_csv())
else:
    disamb = pd.read_csv(disambiguation_file)

#####
# Get a list of (unique) repos for each author (not bot)
# superAuthorDictM: author_id -> repos the disambiguated author contributed to
for name in superAuthorDict:
    if not is_bot(name):
        author_id = get_author_id(name)
        # Fixed: compare against None explicitly — author_id 0 is a valid id
        # (ids are assumed to run from 0 elsewhere in this script), but the
        # old truthiness test silently dropped it.
        if author_id is not None:
            superAuthorDictM[author_id] += superAuthorDict[name]
for elem in superAuthorDictM:
    # Deduplicate each author's repo list
    superAuthorDictM[elem] = list(set(superAuthorDictM[elem]))


###
# Find authors with many repos (and a distribution of repos per author)
reposPerAuthor = {}                            # name -> repo count (authors with > 6 repos)
reposPerAuthorDistribution = defaultdict(int)  # repo count -> number of authors
#print(len(set(superAuthorList)), set(superAuthorList))
for author_id in superAuthorDictM:
    reposPerAuthorDistribution[len(set(superAuthorDictM[author_id]))] += 1
    if len(set(superAuthorDictM[author_id])) > 6:
        print(author_id, get_name(author_id), len(superAuthorDictM[author_id]), superAuthorDictM[author_id])
        reposPerAuthor[get_name(author_id)] = len(superAuthorDictM[author_id])

reposPerAuthor_sorted = sorted(reposPerAuthor.items(), key=lambda x:x[1])
print("reposPerAuthor:", reposPerAuthor_sorted)
print(dict(sorted(reposPerAuthorDistribution.items())))

###
# Corporate e-mail overview: how many aliases use a corporate domain, plus
# per-domain frequencies for all aliases and for the corporate subset.
emails = find_corporate(alias_email)
print("Number of emails and alias", len(emails), len(alias_email))
print("Alias frequency", sorted(domain_frequency(list(set(alias_email))).items(), key=lambda x:x[1]))
print("Emails frequency", sorted(domain_frequency(emails).items(), key=lambda x:x[1]))

###
# Disambiguation of resultsDict
#
# disamb_resultsDict:
# key: repo
# value: dict
#      key: source (commits, issues or pulls)
#      value: tuple with total number of actions and authorDict
#           key: author_id
#           value: number of actions
disamb_resultsDict = {}
for repo in resultsDict:
    disamb_resultsDict[repo] = {}
    for source in ('commits', 'issues', 'pulls'):
        try:
            count, authorsDict = resultsDict[repo][source]
        except KeyError:
            # Fixed: a repo may have no dump for this source; skip it instead
            # of crashing (the consumers below already guard missing sources
            # with their own KeyError handlers).
            continue
        disamb_authorsDict = {}
        for name in authorsDict:
            author_id = get_author_id(name)
            if author_id is None:
                # Unresolvable alias: drop its contributions
                continue
            # Merge the contributions of all aliases mapped to the same id
            if author_id not in disamb_authorsDict:
                disamb_authorsDict[author_id] = authorsDict[name]
            else:
                disamb_authorsDict[author_id] += authorsDict[name]
        disamb_resultsDict[repo][source] = (count, disamb_authorsDict)


###
# Contributions to all repos by authors
authorContrib = defaultdict(int)
for repo in disamb_resultsDict:
    try:
        commits = disamb_resultsDict[repo]["commits"]
        add_contribution(commits[1])
    except KeyError:
        commits = ("-", "-")
    try:
        issues = disamb_resultsDict[repo]["issues"]
        add_contribution(issues[1])
    except KeyError:
        issues = ("-", "-")
    try:
        pulls = disamb_resultsDict[repo]["pulls"]
        add_contribution(pulls[1])
    except KeyError:
        pulls = ("-", "-")
#    to_csv(repo, commits, issues, pulls)
print("authorContrib", authorContrib)
print("Total number of authors:", len(authorContrib))
print("Sum of contributions:", sum(list(authorContrib.values())))


###
# Let's look for the affiliation of authors
# affiliations dict
# Key: author_id    (e.g., 88)
# Value: company domain  (e.g., ibm.com)
affiliation_dict = defaultdict(int)
for index in range(0, len(disamb)):
    df = disamb[disamb.author_id == index]
    email_list = []
    for i, b in df.iterrows():
        email_list.append(df.loc[i, "alias_email"])
        
    email_list = find_corporate(email_list)
    if len(email_list) == 0:
        continue
    if len(email_list) > 1:
        email_list = [email_list[1]]
    domain = get_domain(email_list[0])
    affiliation_dict[index] = domain

print("affiliation_dict", affiliation_dict, len(affiliation_dict))
number = 0
for author in affiliation_dict:
    number += authorContrib[author]
print("Sum of contributions:", number)
"""
# companies dict
# Key: company domain  (e.g., ibm.com)
# Value: company_id  (e.g., 43)
companies_dict = {}
for value in affiliation_dict.values():
    if value not in companies_dict:
        companies_dict[value] = len(companies_dict)

# print(companies_dict, len(companies_dict))

##
# Create a network of authors
#network_list = []
#for repo in disamb_resultsDict:
#    for source in ('commits', 'issues', 'pulls'):
#        try:
#            action = disamb_resultsDict[repo][source]
#            network_list.append(add_edges(action[1]))
#        except KeyError:
#            commits = ("-", "-")


##
# Create a network of companies
network = []
for repo in disamb_resultsDict:
    repo_network = []
    for source in ('commits', 'issues', 'pulls'):
        try:('telenordigital.com', 9), ('tencent.com', 9), ('eurecom.fr', 43)
            action = disamb_resultsDict[repo][source]
            repo_network += add_affiliation_edges(action[1], affiliation_dict, companies_dict, threshold = 4)
        except KeyError:
            action = ("-", "-")
    # print(list(set(repo_network)))    
    network += list(set(repo_network))

print("Affiliation Dict:", affiliation_dict)
print("Companies Dict:", companies_dict)

with open('company_network.txt', 'w') as network_file:
    previous = (None, None)
    count = 0
    for comp_tuple in sorted(network):
        if comp_tuple == previous:
            count += 1
        else:
            network_file.write("{} {} {}\n".format(*comp_tuple, count))
            previous = comp_tuple
            count = 1
    network_file.write("{} {} {}\n".format(*comp_tuple, count)) # last one

# Draw the network
G = nx.read_weighted_edgelist('company_network.txt')
print("Density:", nx.density(G))
print("Average clustering:", nx.average_clustering(G))
print("Degree histogram:", nx.degree_histogram(G))
print("Degree centrality:", nx.degree_centrality(G))
print("Betweenness centrality:", nx.betweenness_centrality(G))
pos = nx.spring_layout(G)
nx.draw_networkx(G, pos=pos, node_size=4, arrowsize=1, with_labels=True)
plt.draw()
plt.show()
"""

##############################################################################################
## Projects CSV
# One row per repository: repo-info metadata plus activity counts and the
# top committer (by disambiguated identity).
# NOTE(review): indexing disamb_resultsDict directly raises KeyError when a
# repo listed in repos_info has no dumps in jsons/ (the commented-out
# except below suggests this has happened before).
with open('projects.csv', 'w') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Repository", "License", "Stars", "Watchers", "Forks", "Created", "Updated", "Commits", "Issues", "Pull-Requests", "Committers", "Percent", "Top Committer Name", "Top Commiter Affiliation"])
    for filename in os.listdir(info_directory):
        csv_list = []
        repo_name, license, stargazers, watchers, forks, created, updated = get_license(filename)
        csv_list += [repo_name, license, stargazers, watchers, forks, created, updated]
     #        print(disamb_resultsDict[repo_name.replace('/', '_')])
        # Dump files encode "owner/repo" as "owner_repo"
        number_commits = disamb_resultsDict[repo_name.replace('/', '_')]["commits"][0]
        number_issues = disamb_resultsDict[repo_name.replace('/', '_')]["issues"][0]
        number_prs = disamb_resultsDict[repo_name.replace('/', '_')]["pulls"][0]
        authorsD = disamb_resultsDict[repo_name.replace('/', '_')]["commits"][1]
        number_authors = len(authorsD)
        # Top committer: id, share of commits, name and affiliation
        max_id, max_name, max_affiliation, percent = "N/A", "N/A", "N/A", "N/A"
        if number_commits: max_id = max(authorsD, key=authorsD.get)
        if number_commits: percent = round(authorsD[max_id]/number_commits * 100)
        if number_commits: max_name = get_name(max_id)
        if number_commits: max_affiliation = affiliation_dict[max_id]
        # affiliation_dict is a defaultdict(int): unknown ids yield 0 -> "N/A"
        if not max_affiliation: max_affiliation = "N/A"
    #        print(number_commits, number_issues, number_prs)
        csv_list += [number_commits, number_issues, number_prs]
    #        print(authorsD, number_authors, max_id, percent, max_name, max_affiliation)
        csv_list += [number_authors, percent, max_name, max_affiliation]
#        except KeyError:
#               csv_list += "KeyError"
        csv_writer.writerow(csv_list)

    
##############################################################################################
## Developers CSV
# One row per repository: license and commit totals followed by a
# (name, affiliation, commits) triple for every disambiguated committer.
with open('developers.csv', 'w') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["Repository", "License", "Commits", "NumberCommitters", "CommitterName", "CommiterAffiliation", "CommiterCommits", "CommitterName", "CommiterAffiliation", "CommiterCommits", "CommitterName", "CommiterAffiliation", "CommiterCommits"])
    for filename in os.listdir(info_directory):
        csv_list = []
        repo_name, license, stargazers, watchers, forks, created, updated = get_license(filename)
        csv_list += [repo_name, license]
        total_number_commits = disamb_resultsDict[repo_name.replace('/', '_')]["commits"][0]
        authorsD = disamb_resultsDict[repo_name.replace('/', '_')]["commits"][1]
        number_authors = len(authorsD)
        csv_list += [total_number_commits, number_authors]
#        print("authorsD", authorsD)
        # Variable-length tail: one triple per committer (the header only
        # names three of them)
        for author_id in authorsD:
#            print(get_name(author_id), affiliation_dict[author_id], authorsD[author_id])
            csv_list += [get_name(author_id), affiliation_dict[author_id], authorsD[author_id]]
        csv_writer.writerow(csv_list)
