__author__ = 'Kevin Kruijthoff' """-------------------------------------------------------------------------------------------------------------------- Author: Kevin Kruijthoff Date: 23 Jul 2015 File: Main_v1.py Version: 1.0 Includes: - Importing patent data from input JSON file • Patent attributes should have a boolean (0 or 1) value - Storing of the individual patents - Iterate the individual patents using the COBWEB* algorithm and store the categorical hierarchical relation in the nodes - * The COBWEB algorithm contains add to existing category, adding as new category, merging and splitting - Export hierarchy from the nodes and data from the patents to JSON output file Manual: Settings are explained in code, for complete documentation see Thesis appendix --------------------------------------------------------------------------------------------------------------------""" """-------------------------------------------------------------------------------------------------------------------- Software user defined settings """ # Input & Output file inputfile = 'The_name_of_the_indexed_patent_file.json' # The Indexed JSON file resulting from the text-mining software outputfile = 'Name_to_give_the_to_be_created_file.json' # JSON file to store the hierarchy, it will be created if it does not exist # COBWEB Operators (See Thesis for explanation of operators) - Adding to existing node is always used operator_new = True # Allow the addition of new categorical nodes operator_merge = True # Allow the merging of nodes operator_split = True # Allow the splitting of nodes # Visualization parameters displayAttributes = True # Do or do not include the visualization of attributes at nodes displayType = True # True = relative / False = absolute display_ratio_number = 10 # Used in the mouse hover in the visualization display_ratio_titles = 3 # Number of attributes shown at the node in visualization # Display thresholds for CTR and TBR (See Thesis for explanation) CarryThroughThreshold = 0.95 # Equal to or larger than TraceBackTreshold = 1.05 # Equal to or smaller than """-------------------------------------------------------------------------------------------------------------------- Imports """ import json import sys import datetime from collections import OrderedDict """-------------------------------------------------------------------------------------------------------------------- Software debug settings """ d = datetime.datetime.now() # Determine the starting time of the software run print("start: " + str(d)) # Print the starting time at the start of the run sys.setrecursionlimit(1800) # Allowable recursion depth """-------------------------------------------------------------------------------------------------------------------- Class that stores patents """ class Patent(): patent_id_counter = 0 patents_total_counts = {} def __init__(self, input_id, input_patent_title, input_patent_original_id, input_company_id, input_attribute_counts): self.patent_unique_id = self.new_id() self.patent_input_id = input_id self.patent_input_title = input_patent_title self.patent_input_patent_original_id = input_patent_original_id self.patent_input_company_id = input_company_id self.patent_node_id = None self.patent_attributes = input_attribute_counts for patent_attribute, patent_attribute_value in self.patent_attributes.items(): if patent_attribute in self.__class__.patents_total_counts: self.__class__.patents_total_counts[patent_attribute] = self.__class__.patents_total_counts[ patent_attribute] + patent_attribute_value else: self.__class__.patents_total_counts[patent_attribute] = patent_attribute_value node.append(Node("new", self, None, None, None, None)) def new_id(self): new_patent_id = self.__class__.patent_id_counter self.__class__.patent_id_counter += 1 return new_patent_id """-------------------------------------------------------------------------------------------------------------------- Class that stores nodes """ class Node(): node_id_counter = 0 tree_root = None patents_number = 0 patents_total_counts = {} def __init__(self, node_type, new_patent_object, add_node_root, new_attributes, new_children, new_patent_count): self.node_unique_id = self.new_id() self.node_root = None self.node_children = [] self.node_attributes = {} self.node_patents = [] self.node_patents_number = 0 if node_type == "tree root": # Make the first node into the tree root else start categorization if self.__class__.tree_root == None: self.__class__.tree_root = self else: if node_type=="add node to concept" or node_type=="merge node": self.node_root = add_node_root self.node_root.node_children.append(self) if node_type=="add node to concept" or node_type=="new": self.node_patents_number += 1 self.node_patents.append(new_patent_object) for attr_key, attr_value in new_patent_object.patent_attributes.items(): if attr_key in self.node_attributes: self.node_attributes[attr_key] = self.node_attributes[attr_key] + attr_value else: self.node_attributes[attr_key] = attr_value if node_type == "new": self.__class__.patents_number += 1 self.cobweb(self.__class__.tree_root) if node_type == "merge node": for attr_key, attr_value in new_attributes.items(): self.node_attributes[attr_key] = attr_value self.node_children = new_children self.node_patents_number = new_patent_count for item in self.node_children: item.node_root = self def new_id(self): new_node_id = self.__class__.node_id_counter self.__class__.node_id_counter += 1 return new_node_id def cobweb(self, search_root): if not search_root.node_children: # add this node under the given root self.node_root = search_root search_root.node_children.append(self) self.add_attribute_counts(search_root, self.node_attributes) # If the root is the tree root do nothing and add the first leaf otherwise create new node and add leaf shifting down the patent if search_root.node_root != None: node.append(Node("add node to concept",search_root.node_patents[0],search_root, None, None, None)) search_root.node_patents = [] else: # check for adding to existing temp_total = {} temp_total_patent_number = 0 temp_count_children = 0 temp_best = None temp_second_best = None temp_best_cu = None temp_second_best_cu = None # add the counts of the root to the temporary hash for attr_key, attr_value in search_root.node_attributes.items(): if attr_key in temp_total: temp_total[attr_key] = temp_total[attr_key] + attr_value else: temp_total[attr_key] = attr_value # add the counts of the new patent to the temporary hash for attr_key, attr_value in self.node_attributes.items(): if attr_key in temp_total: temp_total[attr_key] = temp_total[attr_key] + attr_value else: temp_total[attr_key] = attr_value # count the number of patents in the children combined for x in search_root.node_children: temp_total_patent_number += x.node_patents_number temp_count_children += 1 # add one patent as the virtual patent temp_total_patent_number += 1 # calculate the expected number of correct guesses with no knowledge - equal for all children categories expected_correct_guesses = 0 for attr_key, attr_value in temp_total.items(): expected_correct_guesses += (attr_value / temp_total_patent_number) * (attr_value / temp_total_patent_number) expected_correct_guesses += (1 - (attr_value / temp_total_patent_number)) * (1 - (attr_value / temp_total_patent_number)) # calculate the expected number of attribute values that can be correctly guessed for trybranch in search_root.node_children: temp_branch_sum = 0 for branch in search_root.node_children: temp_branch_counts = {} temp_branch_patent_number = 0 temp_branch_patent_number += branch.node_patents_number if trybranch == branch: temp_branch_patent_number += 1 temp_branch_probability = temp_branch_patent_number / temp_total_patent_number for attr_key, attr_value in branch.node_attributes.items(): temp_branch_counts[attr_key] = attr_value if trybranch == branch: for attr_key, attr_value in self.node_attributes.items(): temp_branch_counts[attr_key] += attr_value predicted_correct_guesses = 0 for attr_key, attr_value in temp_branch_counts.items(): predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number) predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number)) temp_branch_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses) temp_branch_cu = temp_branch_sum / temp_count_children if temp_best_cu == None: temp_best_cu = temp_branch_cu temp_best = trybranch else: if temp_branch_cu > temp_best_cu: temp_second_best_cu = temp_best_cu temp_best_cu = temp_branch_cu temp_second_best = temp_best temp_best = trybranch else: if temp_second_best_cu == None: temp_second_best_cu = temp_branch_cu temp_second_best = trybranch else: if temp_branch_cu > temp_best_cu: temp_second_best_cu = temp_branch_cu temp_second_best = trybranch # check for adding new temp_new_branch_probability = 0 temp_new_branch_probability += 1/temp_total_patent_number temp_new_branch_count_children = temp_count_children + 1 new_predicted_correct_guesses = 0 for attr_key, attr_value in self.node_attributes.items(): new_predicted_correct_guesses += attr_value * attr_value new_predicted_correct_guesses += (1 - attr_value) * (1 - attr_value) temp_new_sum = 0 temp_new_sum += temp_new_branch_probability * (new_predicted_correct_guesses - expected_correct_guesses) for branch in search_root.node_children: temp_branch_counts = {} temp_branch_patent_number = 0 temp_branch_patent_number += branch.node_patents_number temp_branch_probability = temp_branch_patent_number / temp_total_patent_number for attr_key, attr_value in branch.node_attributes.items(): temp_branch_counts[attr_key] = attr_value predicted_correct_guesses = 0 for attr_key, attr_value in temp_branch_counts.items(): predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number) predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number)) temp_new_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses) temp_new_cu = temp_new_sum / temp_new_branch_count_children # Check for merging if temp_best != None and temp_second_best != None: merging_possible = True temp_merge_branch_probability = 0 temp_merge_branch_patent_number = temp_best.node_patents_number + temp_second_best.node_patents_number + 1 temp_merge_branch_probability += temp_merge_branch_patent_number/temp_total_patent_number temp_merge_branch_count_children = temp_count_children - 1 temp_merge_sum = 0 for branch in search_root.node_children: if branch != temp_best and branch != temp_second_best: temp_merge_attributes = {} for attr_key, attr_value in branch.node_attributes.items(): temp_merge_attributes[attr_key] = attr_value merge_predicted_correct_guesses = 0 for attr_key, attr_value in temp_merge_attributes.items(): merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number) merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number)) temp_merge_sum += (branch.node_patents_number/temp_total_patent_number) * (merge_predicted_correct_guesses - expected_correct_guesses) temp_merge_attributes = {} for attr_key, attr_value in temp_best.node_attributes.items(): temp_merge_attributes[attr_key] = attr_value for attr_key, attr_value in temp_second_best.node_attributes.items(): temp_merge_attributes[attr_key] += attr_value for attr_key, attr_value in self.node_attributes.items(): temp_merge_attributes[attr_key] += attr_value merge_predicted_correct_guesses = 0 for attr_key, attr_value in temp_merge_attributes.items(): merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number) merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number)) temp_merge_sum += temp_merge_branch_probability * (merge_predicted_correct_guesses - expected_correct_guesses) temp_merge_cu = temp_merge_sum / temp_merge_branch_count_children else: merging_possible = False # Check for splitting if temp_best != None and temp_best.node_patents_number > 1: splitting_possible = True temp_split_branch_count_children = 0 temp_split_branch_count_children += temp_count_children temp_split_branch_count_children -= 1 for item in temp_best.node_children: temp_split_branch_count_children += 1 temp_split_best = None temp_split_best_cu = None temp_split_store_sum = 0 for branch in search_root.node_children: if branch != temp_best: temp_split_branch_patent_number = branch.node_patents_number temp_split_attributes = {} for attr_key, attr_value in branch.node_attributes.items(): temp_split_attributes[attr_key] = attr_value split_predicted_correct_guesses = 0 for attr_key, attr_value in temp_split_attributes.items(): split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number) split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number)) temp_split_branch_probability = 0 temp_split_branch_probability += branch.node_patents_number/temp_total_patent_number temp_split_store_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses) for trybranch in temp_best.node_children: temp_split_sum = 0 for branch in temp_best.node_children: temp_split_branch_patent_number = branch.node_patents_number temp_split_attributes = {} for attr_key, attr_value in branch.node_attributes.items(): temp_split_attributes[attr_key] = attr_value if branch == trybranch: temp_split_branch_patent_number += 1 for attr_key, attr_value in self.node_attributes.items(): temp_split_attributes[attr_key] += attr_value split_predicted_correct_guesses = 0 for attr_key, attr_value in temp_split_attributes.items(): split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number) split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number)) temp_split_branch_patent_number = branch.node_patents_number if branch == trybranch: temp_split_branch_patent_number += 1 temp_split_branch_probability = 0 temp_split_branch_probability += temp_split_branch_patent_number/temp_total_patent_number temp_split_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses) temp_split_cu = (temp_split_store_sum + temp_split_sum) / temp_split_branch_count_children if temp_split_best == None: temp_split_best = trybranch temp_split_best_cu = temp_split_cu else: if temp_split_cu > temp_split_best_cu: temp_split_best = trybranch temp_split_best_cu = temp_split_cu else: splitting_possible = False # Checks for best possible CU max_cu_value = temp_best_cu max_cu = "child" if (temp_new_cu >= max_cu_value) and (operator_new == True): max_cu_value = temp_new_cu max_cu = "new" if merging_possible and (temp_merge_cu > max_cu_value) and (operator_merge == True): max_cu_value = temp_merge_cu max_cu = "merge" if splitting_possible and (temp_split_best_cu > max_cu_value) and (operator_split == True): max_cu_value = temp_split_best_cu max_cu = "split" # Check if adding this node as a new category is better than putting it under an existing category if max_cu == "new": self.node_root = search_root search_root.node_children.append(self) self.add_attribute_counts(search_root, self.node_attributes) # Check if adding this node as a new category is better than putting it under an existing category elif max_cu == "merge": merge_patent_count = temp_best.node_patents_number + temp_second_best.node_patents_number merge_attributes = {} for attr_key, attr_value in temp_best.node_attributes.items(): merge_attributes[attr_key] = attr_value for attr_key, attr_value in temp_second_best.node_attributes.items(): merge_attributes[attr_key] += attr_value merge_children = [] merge_children.append(temp_best) merge_children.append(temp_second_best) node.append(Node("merge node", None, search_root, merge_attributes, merge_children, merge_patent_count)) search_root.node_children.remove(temp_best) search_root.node_children.remove(temp_second_best) self.cobweb(temp_best) # If splitting is the best option elif max_cu == "split": for item in temp_best.node_children: search_root.node_children.append(item) item.node_root = search_root search_root.node_children.remove(temp_best) self.cobweb(search_root) # Go down to most fitting branch elif max_cu == "child": self.cobweb(temp_best) # Catching problem else: print("This should never happen!") def add_attribute_counts(self, start_node, count_hash): start_node.node_patents_number += 1 for attr_key, attr_value in count_hash.items(): if attr_key in start_node.node_attributes: start_node.node_attributes[attr_key] = start_node.node_attributes[attr_key] + attr_value else: start_node.node_attributes[attr_key] = attr_value if start_node.node_root != None: self.add_attribute_counts(start_node.node_root, count_hash) def create_json_structure(self): output_json = self.loop_items(self) return output_json def loop_items(self, from_this_node): output_node = {} # Check if from_this_node is a leaf if not from_this_node.node_children: output_node["patent"] = from_this_node.node_patents[0].patent_input_id output_node["patent_original_id"] = from_this_node.node_patents[0].patent_input_patent_original_id output_node["patent_title"] = from_this_node.node_patents[0].patent_input_title output_node["company"] = from_this_node.node_patents[0].patent_input_company_id # If it is not a leaf run loop_items for all children else: kids = [] for item in from_this_node.node_children: # This nested loop will explore all the branches to the end kids.append(self.loop_items(item)) output_node["children"] = kids # Don't do for the tree root as it has all at 100% if from_this_node != self.__class__.tree_root: # Define the category characteristics for display category_attributes = "" category_attributes += "patents: " + str(from_this_node.node_patents_number) + " / " temp_ratio_store = {} for attr_key, attr_value in from_this_node.node_attributes.items(): if displayType == True: temp_ratio = from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key] else: temp_ratio = from_this_node.node_attributes[attr_key] / from_this_node.node_patents_number if round(temp_ratio,3) > 0: temp_ratio_store[attr_key] = (round(temp_ratio,3)*100) temp_ratio_count = 0 temp_ratio_store_ordered = OrderedDict(sorted(temp_ratio_store.items(), key=lambda kv: kv[1], reverse=True)) output_node["name"] = "" for attr_key, attr_value in temp_ratio_store_ordered.items(): category_attributes += str(attr_key) + "(" + str(round(attr_value,1)) + "% / P " + str(round((from_this_node.node_attributes[attr_key]/from_this_node.node_patents_number),2)) + "), " temp_ratio_count += 1 if temp_ratio_count >= display_ratio_number: break temp_ratio_count = 0 if displayAttributes == True: # Parse the attributes on ascending relevancy for attr_key, attr_value in temp_ratio_store_ordered.items(): found = False # Dont show if only one patent has this attribute if from_this_node.node_root.node_attributes[attr_key] > 1: # Parse children to calculate their Carry Through Ratio and compare it with the accepted Carry Through Ratio for child in from_this_node.node_children: if displayType == True: CarryThroughRatio = (100*round((child.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3)) / attr_value else: CarryThroughRatio = child.node_attributes[attr_key] / from_this_node.node_attributes[attr_key] #(100*round((child.node_attributes[attr_key]/child.node_patents_number),3)) / attr_value if CarryThroughRatio >= CarryThroughThreshold: found = True # Check its parent if displayType == True: TraceBackAlpha = attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3)) else: TraceBackAlpha = from_this_node.node_attributes[attr_key] / from_this_node.node_root.node_attributes[attr_key] #attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/from_this_node.node_root.node_patents_number),3)) TraceBackBeta = from_this_node.node_patents_number / from_this_node.node_root.node_patents_number if (TraceBackAlpha/TraceBackBeta) <= TraceBackTreshold: found = True if found == False: if displayType == True: display_intensity_ratio = attr_value else: display_intensity_ratio = 100*round((from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key]),3) if temp_ratio_count > 0: output_node["name"] += " - " if display_intensity_ratio >= 75: output_node["name"] += "" + str(attr_key) + "" if display_intensity_ratio < 75 and display_intensity_ratio >= 50: output_node["name"] += "" + str(attr_key) + "" if display_intensity_ratio < 50 and display_intensity_ratio >= 25: output_node["name"] += "" + str(attr_key) + "" if display_intensity_ratio < 25 and display_intensity_ratio >= 0: output_node["name"] += "" + str(attr_key) + "" temp_ratio_count += 1 if temp_ratio_count >= display_ratio_titles: break output_node["description"] = category_attributes return output_node """-------------------------------------------------------------------------------------------------------------------- Setup """ # Create lists for patents and nodes patent = [] node = [] #Create tree root node node.append(Node("tree root", None, None, None, None, None)) """-------------------------------------------------------------------------------------------------------------------- Import and iterate over de inputs """ #Import the data from the json file data_file_input = open(inputfile).read() input_data = json.loads(data_file_input) print("Data imported from JSON: " + str(datetime.datetime.now())) #Iterate over the patent entries for item in input_data['Patents']: patent_id = item.get('id') patent_title = item.get('patent_title') patent_original_id = item.get('input_id') patent_company_id = item.get('company_id') patent_attributes = item['Attributes'][0] patent.append(Patent(patent_id, patent_title, patent_original_id, patent_company_id, patent_attributes)) print("Patents added to tree: " + str(datetime.datetime.now())) """-------------------------------------------------------------------------------------------------------------------- Write to JSON for visualization """ print("Start converting tree to output JSON: " + str(datetime.datetime.now())) data_file_output = open(outputfile,mode='w+') output_hash = node[0].create_json_structure() print("Output JSON created: " + str(datetime.datetime.now())) json.dump(output_hash,data_file_output,indent=2,sort_keys=True) print("End: " + str(datetime.datetime.now()))