__author__ = 'Kevin Kruijthoff'

"""--------------------------------------------------------------------------------------------------------------------

Author:     Kevin Kruijthoff

Date:       23 Jul 2015

File:       Main_v1.py

Version:    1.0

Includes:   - Importing patent data from input JSON file
              • Patent attributes should have a boolean (0 or 1) value
            - Storing of the individual patents
            - Iterate the individual patents using the COBWEB* algorithm and store the categorical hierarchical relation
              in the nodes
            - * The COBWEB algorithm contains add to existing category, adding as new category, merging and splitting
            - Export hierarchy from the nodes and data from the patents to JSON output file

Manual:     Settings are explained in code, for complete documentation see Thesis appendix

--------------------------------------------------------------------------------------------------------------------"""


"""--------------------------------------------------------------------------------------------------------------------
Software user defined settings
"""
# Input & Output file
inputfile =  'The_name_of_the_indexed_patent_file.json' # The Indexed JSON file resulting from the text-mining software
outputfile = 'Name_to_give_the_to_be_created_file.json' # JSON file to store the hierarchy, it will be created if it does not exist

# COBWEB Operators (See Thesis for explanation of operators) - Adding to existing node is always used
operator_new = True                 # Allow the addition of new categorical nodes
operator_merge = True               # Allow the merging of nodes
operator_split = True               # Allow the splitting of nodes

# Visualization parameters
displayAttributes = True            # Do or do not include the visualization of attributes at nodes
displayType = True                  # True = relative / False = absolute
display_ratio_number = 10           # Used in the mouse hover in the visualization
display_ratio_titles = 3            # Number of attributes shown at the node in visualization

# Display thresholds for CTR and TBR (See Thesis for explanation)
CarryThroughThreshold = 0.95        # Equal to or larger than
TraceBackTreshold     = 1.05        # Equal to or smaller than


"""--------------------------------------------------------------------------------------------------------------------
Imports
"""
import json
import sys
import datetime
from collections import OrderedDict


"""--------------------------------------------------------------------------------------------------------------------
Software debug settings
"""
d = datetime.datetime.now()         # Determine the starting time of the software run
print("start: " + str(d))           # Print the starting time at the start of the run
sys.setrecursionlimit(1800)         # Allowable recursion depth


"""--------------------------------------------------------------------------------------------------------------------
Class that stores patents
"""

class Patent():
    patent_id_counter = 0
    patents_total_counts = {}

    def __init__(self, input_id, input_patent_title, input_patent_original_id, input_company_id, input_attribute_counts):
        self.patent_unique_id = self.new_id()
        self.patent_input_id = input_id
        self.patent_input_title = input_patent_title
        self.patent_input_patent_original_id = input_patent_original_id
        self.patent_input_company_id = input_company_id
        self.patent_node_id = None
        self.patent_attributes = input_attribute_counts

        for patent_attribute, patent_attribute_value in self.patent_attributes.items():
            if patent_attribute in self.__class__.patents_total_counts:
                self.__class__.patents_total_counts[patent_attribute] = self.__class__.patents_total_counts[
                                                                            patent_attribute] + patent_attribute_value
            else:
                self.__class__.patents_total_counts[patent_attribute] = patent_attribute_value

        node.append(Node("new", self, None, None, None, None))

    def new_id(self):
        new_patent_id = self.__class__.patent_id_counter
        self.__class__.patent_id_counter += 1
        return new_patent_id


"""--------------------------------------------------------------------------------------------------------------------
Class that stores nodes
"""

class Node():
    node_id_counter = 0
    tree_root = None
    patents_number = 0
    patents_total_counts = {}

    def __init__(self, node_type, new_patent_object, add_node_root, new_attributes, new_children, new_patent_count):
        self.node_unique_id = self.new_id()
        self.node_root = None
        self.node_children = []
        self.node_attributes = {}
        self.node_patents = []
        self.node_patents_number = 0

        if node_type == "tree root":
            # Make the first node into the tree root else start categorization
            if self.__class__.tree_root == None:
                self.__class__.tree_root = self
        else:

            if node_type=="add node to concept" or node_type=="merge node":
                self.node_root = add_node_root
                self.node_root.node_children.append(self)

            if node_type=="add node to concept" or node_type=="new":

                self.node_patents_number += 1
                self.node_patents.append(new_patent_object)

                for attr_key, attr_value in new_patent_object.patent_attributes.items():
                    if attr_key in self.node_attributes:
                        self.node_attributes[attr_key] = self.node_attributes[attr_key] + attr_value
                    else:
                        self.node_attributes[attr_key] = attr_value

            if node_type == "new":
                self.__class__.patents_number += 1
                self.cobweb(self.__class__.tree_root)

            if node_type == "merge node":
                for attr_key, attr_value in new_attributes.items():
                    self.node_attributes[attr_key] = attr_value
                self.node_children = new_children
                self.node_patents_number = new_patent_count
                for item in self.node_children:
                    item.node_root = self


    def new_id(self):
        new_node_id = self.__class__.node_id_counter
        self.__class__.node_id_counter += 1
        return new_node_id


    def cobweb(self, search_root):
        if not search_root.node_children:

            # add this node under the given root
            self.node_root = search_root
            search_root.node_children.append(self)
            self.add_attribute_counts(search_root, self.node_attributes)

            # If the root is the tree root do nothing and add the first leaf otherwise create new node and add leaf shifting down the patent
            if search_root.node_root != None:
                node.append(Node("add node to concept",search_root.node_patents[0],search_root, None, None, None))
                search_root.node_patents = []

        else:
            # check for adding to existing
            temp_total = {}
            temp_total_patent_number = 0
            temp_count_children = 0
            temp_best = None
            temp_second_best = None
            temp_best_cu = None
            temp_second_best_cu = None

            # add the counts of the root to the temporary hash
            for attr_key, attr_value in search_root.node_attributes.items():
                if attr_key in temp_total:
                    temp_total[attr_key] = temp_total[attr_key] + attr_value
                else:
                    temp_total[attr_key] = attr_value

            # add the counts of the new patent to the temporary hash
            for attr_key, attr_value in self.node_attributes.items():
                if attr_key in temp_total:
                    temp_total[attr_key] = temp_total[attr_key] + attr_value
                else:
                    temp_total[attr_key] = attr_value

            # count the number of patents in the children combined
            for x in search_root.node_children:
                temp_total_patent_number += x.node_patents_number
                temp_count_children += 1

            # add one patent as the virtual patent
            temp_total_patent_number += 1

            # calculate the expected number of correct guesses with no knowledge - equal for all children categories
            expected_correct_guesses = 0
            for attr_key, attr_value in temp_total.items():
                expected_correct_guesses += (attr_value / temp_total_patent_number) * (attr_value / temp_total_patent_number)
                expected_correct_guesses += (1 - (attr_value / temp_total_patent_number)) * (1 - (attr_value / temp_total_patent_number))

            # calculate the expected number of attribute values that can be correctly guessed
            for trybranch in search_root.node_children:
                temp_branch_sum = 0

                for branch in search_root.node_children:
                    temp_branch_counts = {}
                    temp_branch_patent_number = 0
                    temp_branch_patent_number += branch.node_patents_number

                    if trybranch == branch:
                        temp_branch_patent_number += 1

                    temp_branch_probability = temp_branch_patent_number / temp_total_patent_number

                    for attr_key, attr_value in branch.node_attributes.items():
                        temp_branch_counts[attr_key] = attr_value

                    if trybranch == branch:
                        for attr_key, attr_value in self.node_attributes.items():
                            temp_branch_counts[attr_key] += attr_value

                    predicted_correct_guesses = 0
                    for attr_key, attr_value in temp_branch_counts.items():
                        predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number)
                        predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number))

                    temp_branch_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses)

                temp_branch_cu = temp_branch_sum / temp_count_children

                if temp_best_cu == None:
                    temp_best_cu = temp_branch_cu
                    temp_best = trybranch
                else:
                    if temp_branch_cu > temp_best_cu:
                        temp_second_best_cu = temp_best_cu
                        temp_best_cu = temp_branch_cu
                        temp_second_best = temp_best
                        temp_best = trybranch
                    else:
                        if temp_second_best_cu == None:
                            temp_second_best_cu = temp_branch_cu
                            temp_second_best = trybranch
                        else:
                            if temp_branch_cu > temp_best_cu:
                                temp_second_best_cu = temp_branch_cu
                                temp_second_best = trybranch

            # check for adding new
            temp_new_branch_probability = 0
            temp_new_branch_probability += 1/temp_total_patent_number
            temp_new_branch_count_children = temp_count_children + 1
            new_predicted_correct_guesses = 0
            for attr_key, attr_value in self.node_attributes.items():
                new_predicted_correct_guesses += attr_value * attr_value
                new_predicted_correct_guesses += (1 - attr_value) * (1 - attr_value)

            temp_new_sum = 0
            temp_new_sum += temp_new_branch_probability * (new_predicted_correct_guesses - expected_correct_guesses)

            for branch in search_root.node_children:
                temp_branch_counts = {}
                temp_branch_patent_number = 0
                temp_branch_patent_number += branch.node_patents_number
                temp_branch_probability = temp_branch_patent_number / temp_total_patent_number

                for attr_key, attr_value in branch.node_attributes.items():
                    temp_branch_counts[attr_key] = attr_value

                predicted_correct_guesses = 0
                for attr_key, attr_value in temp_branch_counts.items():
                    predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number)
                    predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number))

                temp_new_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses)

            temp_new_cu = temp_new_sum / temp_new_branch_count_children

            # Check for merging
            if temp_best != None and temp_second_best != None:
                merging_possible = True
                temp_merge_branch_probability = 0
                temp_merge_branch_patent_number = temp_best.node_patents_number + temp_second_best.node_patents_number + 1
                temp_merge_branch_probability += temp_merge_branch_patent_number/temp_total_patent_number
                temp_merge_branch_count_children = temp_count_children - 1
                temp_merge_sum = 0

                for branch in search_root.node_children:
                    if branch != temp_best and branch != temp_second_best:
                        temp_merge_attributes = {}
                        for attr_key, attr_value in branch.node_attributes.items():
                            temp_merge_attributes[attr_key] = attr_value
                        merge_predicted_correct_guesses = 0
                        for attr_key, attr_value in temp_merge_attributes.items():
                            merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number)
                            merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number))
                        temp_merge_sum += (branch.node_patents_number/temp_total_patent_number) * (merge_predicted_correct_guesses - expected_correct_guesses)

                temp_merge_attributes = {}
                for attr_key, attr_value in temp_best.node_attributes.items():
                    temp_merge_attributes[attr_key] = attr_value
                for attr_key, attr_value in temp_second_best.node_attributes.items():
                    temp_merge_attributes[attr_key] += attr_value
                for attr_key, attr_value in self.node_attributes.items():
                    temp_merge_attributes[attr_key] += attr_value

                merge_predicted_correct_guesses = 0
                for attr_key, attr_value in temp_merge_attributes.items():
                    merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number)
                    merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number))

                temp_merge_sum += temp_merge_branch_probability * (merge_predicted_correct_guesses - expected_correct_guesses)
                temp_merge_cu = temp_merge_sum / temp_merge_branch_count_children

            else:
                merging_possible = False

            # Check for splitting
            if temp_best != None and temp_best.node_patents_number > 1:
                splitting_possible = True
                temp_split_branch_count_children = 0
                temp_split_branch_count_children += temp_count_children
                temp_split_branch_count_children -= 1

                for item in temp_best.node_children:
                    temp_split_branch_count_children += 1

                temp_split_best = None
                temp_split_best_cu = None
                temp_split_store_sum = 0

                for branch in search_root.node_children:
                    if branch != temp_best:
                        temp_split_branch_patent_number = branch.node_patents_number
                        temp_split_attributes = {}
                        for attr_key, attr_value in branch.node_attributes.items():
                            temp_split_attributes[attr_key] = attr_value
                        split_predicted_correct_guesses = 0
                        for attr_key, attr_value in temp_split_attributes.items():
                            split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number)
                            split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number))
                        temp_split_branch_probability = 0
                        temp_split_branch_probability += branch.node_patents_number/temp_total_patent_number
                        temp_split_store_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses)

                for trybranch in temp_best.node_children:

                    temp_split_sum = 0

                    for branch in temp_best.node_children:
                        temp_split_branch_patent_number = branch.node_patents_number
                        temp_split_attributes = {}
                        for attr_key, attr_value in branch.node_attributes.items():
                            temp_split_attributes[attr_key] = attr_value
                        if branch == trybranch:
                            temp_split_branch_patent_number += 1
                            for attr_key, attr_value in self.node_attributes.items():
                                temp_split_attributes[attr_key] += attr_value
                        split_predicted_correct_guesses = 0
                        for attr_key, attr_value in temp_split_attributes.items():
                            split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number)
                            split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number))
                        temp_split_branch_patent_number = branch.node_patents_number
                        if branch == trybranch:
                            temp_split_branch_patent_number += 1
                        temp_split_branch_probability = 0
                        temp_split_branch_probability += temp_split_branch_patent_number/temp_total_patent_number
                        temp_split_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses)

                    temp_split_cu = (temp_split_store_sum + temp_split_sum) / temp_split_branch_count_children

                    if temp_split_best == None:
                        temp_split_best = trybranch
                        temp_split_best_cu = temp_split_cu
                    else:
                        if temp_split_cu > temp_split_best_cu:
                            temp_split_best = trybranch
                            temp_split_best_cu = temp_split_cu

            else:
                splitting_possible = False

            # Checks for best possible CU
            max_cu_value = temp_best_cu
            max_cu = "child"
            if (temp_new_cu >= max_cu_value) and (operator_new == True):
                max_cu_value = temp_new_cu
                max_cu = "new"
            if merging_possible and (temp_merge_cu > max_cu_value) and (operator_merge == True):
                max_cu_value = temp_merge_cu
                max_cu = "merge"
            if splitting_possible and (temp_split_best_cu > max_cu_value) and (operator_split == True):
                max_cu_value = temp_split_best_cu
                max_cu = "split"

            # Check if adding this node as a new category is better than putting it under an existing category
            if max_cu == "new":
                self.node_root = search_root
                search_root.node_children.append(self)
                self.add_attribute_counts(search_root, self.node_attributes)

            # Check if adding this node as a new category is better than putting it under an existing category
            elif max_cu == "merge":
                merge_patent_count = temp_best.node_patents_number + temp_second_best.node_patents_number
                merge_attributes = {}

                for attr_key, attr_value in temp_best.node_attributes.items():
                    merge_attributes[attr_key] = attr_value
                for attr_key, attr_value in temp_second_best.node_attributes.items():
                    merge_attributes[attr_key] += attr_value

                merge_children = []
                merge_children.append(temp_best)
                merge_children.append(temp_second_best)

                node.append(Node("merge node", None, search_root, merge_attributes, merge_children, merge_patent_count))

                search_root.node_children.remove(temp_best)
                search_root.node_children.remove(temp_second_best)

                self.cobweb(temp_best)

            # If splitting is the best option
            elif max_cu == "split":
                for item in temp_best.node_children:
                    search_root.node_children.append(item)
                    item.node_root = search_root

                search_root.node_children.remove(temp_best)
                self.cobweb(search_root)

            # Go down to most fitting branch
            elif max_cu == "child":
                self.cobweb(temp_best)

            # Catching problem
            else:
                print("This should never happen!")


    def add_attribute_counts(self, start_node, count_hash):
        start_node.node_patents_number += 1
        for attr_key, attr_value in count_hash.items():
            if attr_key in start_node.node_attributes:
                start_node.node_attributes[attr_key] = start_node.node_attributes[attr_key] + attr_value
            else:
                start_node.node_attributes[attr_key] = attr_value
        if start_node.node_root != None:
            self.add_attribute_counts(start_node.node_root, count_hash)


    def create_json_structure(self):
        output_json = self.loop_items(self)
        return output_json


    def loop_items(self, from_this_node):
        output_node = {}

        # Check if from_this_node is a leaf
        if not from_this_node.node_children:
            output_node["patent"] = from_this_node.node_patents[0].patent_input_id
            output_node["patent_original_id"] = from_this_node.node_patents[0].patent_input_patent_original_id
            output_node["patent_title"] = from_this_node.node_patents[0].patent_input_title
            output_node["company"] = from_this_node.node_patents[0].patent_input_company_id

        # If it is not a leaf run loop_items for all children
        else:
            kids = []
            for item in from_this_node.node_children:
                # This nested loop will explore all the branches to the end
                kids.append(self.loop_items(item))
            output_node["children"] = kids

            # Don't do for the tree root as it has all at 100%
            if from_this_node != self.__class__.tree_root:

                # Define the category characteristics for display
                category_attributes = ""
                category_attributes += "patents: " + str(from_this_node.node_patents_number) + " / "

                temp_ratio_store = {}

                for attr_key, attr_value in from_this_node.node_attributes.items():
                    if displayType == True:
                        temp_ratio = from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key]
                    else:
                        temp_ratio = from_this_node.node_attributes[attr_key] / from_this_node.node_patents_number
                    if round(temp_ratio,3) > 0:
                        temp_ratio_store[attr_key] = (round(temp_ratio,3)*100)

                temp_ratio_count = 0
                temp_ratio_store_ordered = OrderedDict(sorted(temp_ratio_store.items(), key=lambda kv: kv[1], reverse=True))
                output_node["name"] = ""

                for attr_key, attr_value in temp_ratio_store_ordered.items():
                    category_attributes += str(attr_key) + "(" + str(round(attr_value,1)) + "% / P " + str(round((from_this_node.node_attributes[attr_key]/from_this_node.node_patents_number),2)) + "), "
                    temp_ratio_count += 1
                    if temp_ratio_count >= display_ratio_number:
                        break

                temp_ratio_count = 0

                if displayAttributes == True:

                    # Parse the attributes on ascending relevancy
                    for attr_key, attr_value in temp_ratio_store_ordered.items():

                        found = False

                        # Dont show if only one patent has this attribute
                        if from_this_node.node_root.node_attributes[attr_key] > 1:

                            # Parse children to calculate their Carry Through Ratio and compare it with the accepted Carry Through Ratio
                            for child in from_this_node.node_children:

                                if displayType == True:
                                    CarryThroughRatio = (100*round((child.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3)) / attr_value
                                else:
                                    CarryThroughRatio = child.node_attributes[attr_key] / from_this_node.node_attributes[attr_key] #(100*round((child.node_attributes[attr_key]/child.node_patents_number),3)) / attr_value

                                if CarryThroughRatio >= CarryThroughThreshold:
                                    found = True

                            # Check its parent
                            if displayType == True:
                                TraceBackAlpha = attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3))
                            else:
                                TraceBackAlpha = from_this_node.node_attributes[attr_key] / from_this_node.node_root.node_attributes[attr_key] #attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/from_this_node.node_root.node_patents_number),3))

                            TraceBackBeta  = from_this_node.node_patents_number / from_this_node.node_root.node_patents_number

                            if (TraceBackAlpha/TraceBackBeta) <= TraceBackTreshold:
                                found = True

                            if found == False:
                                if displayType == True:
                                    display_intensity_ratio = attr_value
                                else:
                                    display_intensity_ratio = 100*round((from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key]),3)

                                if temp_ratio_count > 0:
                                    output_node["name"] += "<tspan class='cat4'> - </tspan>"

                                if display_intensity_ratio >= 75:
                                    output_node["name"] += "<tspan class='cat1'>" + str(attr_key) + "</tspan>"
                                if display_intensity_ratio < 75 and display_intensity_ratio >= 50:
                                    output_node["name"] += "<tspan class='cat2'>" + str(attr_key) + "</tspan>"
                                if display_intensity_ratio < 50 and display_intensity_ratio >= 25:
                                    output_node["name"] += "<tspan class='cat3'>" + str(attr_key) + "</tspan>"
                                if display_intensity_ratio < 25 and display_intensity_ratio >= 0:
                                    output_node["name"] += "<tspan class='cat4'>" + str(attr_key) + "</tspan>"

                                temp_ratio_count += 1

                        if temp_ratio_count >= display_ratio_titles:
                            break

                output_node["description"] = category_attributes

        return output_node


"""--------------------------------------------------------------------------------------------------------------------
Setup
"""
# Create lists for patents and nodes
patent = []
node = []

#Create tree root node
node.append(Node("tree root", None, None, None, None, None))

"""--------------------------------------------------------------------------------------------------------------------
Import and iterate over de inputs
"""
#Import the data from the json file
data_file_input = open(inputfile).read()
input_data = json.loads(data_file_input)

print("Data imported from JSON: " + str(datetime.datetime.now()))

#Iterate over the patent entries
for item in input_data['Patents']:
    patent_id = item.get('id')
    patent_title = item.get('patent_title')
    patent_original_id = item.get('input_id')
    patent_company_id = item.get('company_id')
    patent_attributes = item['Attributes'][0]
    patent.append(Patent(patent_id, patent_title, patent_original_id, patent_company_id, patent_attributes))

print("Patents added to tree: " + str(datetime.datetime.now()))


"""--------------------------------------------------------------------------------------------------------------------
Write to JSON for visualization
"""
print("Start converting tree to output JSON: " + str(datetime.datetime.now()))
data_file_output = open(outputfile,mode='w+')
output_hash = node[0].create_json_structure()
print("Output JSON created: " + str(datetime.datetime.now()))
json.dump(output_hash,data_file_output,indent=2,sort_keys=True)
print("End: " + str(datetime.datetime.now()))