__author__ = 'Kevin Kruijthoff'
"""--------------------------------------------------------------------------------------------------------------------
Author: Kevin Kruijthoff
Date: 23 Jul 2015
File: Main_v1.py
Version: 1.0
Includes: - Importing patent data from input JSON file
• Patent attributes should have a boolean (0 or 1) value
- Storing of the individual patents
- Iterate the individual patents using the COBWEB* algorithm and store the categorical hierarchical relation
in the nodes
- * The COBWEB algorithm contains add to existing category, adding as new category, merging and splitting
- Export hierarchy from the nodes and data from the patents to JSON output file
Manual: Settings are explained in code, for complete documentation see Thesis appendix
--------------------------------------------------------------------------------------------------------------------"""
"""--------------------------------------------------------------------------------------------------------------------
Software user defined settings
"""
# Input & Output file
inputfile = 'The_name_of_the_indexed_patent_file.json' # The Indexed JSON file resulting from the text-mining software
outputfile = 'Name_to_give_the_to_be_created_file.json' # JSON file to store the hierarchy, it will be created if it does not exist
# COBWEB Operators (See Thesis for explanation of operators) - Adding to existing node is always used
operator_new = True # Allow the addition of new categorical nodes
operator_merge = True # Allow the merging of nodes
operator_split = True # Allow the splitting of nodes
# Visualization parameters
displayAttributes = True # Do or do not include the visualization of attributes at nodes
displayType = True # True = relative / False = absolute
display_ratio_number = 10 # Used in the mouse hover in the visualization
display_ratio_titles = 3 # Number of attributes shown at the node in visualization
# Display thresholds for CTR and TBR (See Thesis for explanation)
CarryThroughThreshold = 0.95 # Equal to or larger than
TraceBackTreshold = 1.05 # Equal to or smaller than
"""--------------------------------------------------------------------------------------------------------------------
Imports
"""
import json
import sys
import datetime
from collections import OrderedDict
"""--------------------------------------------------------------------------------------------------------------------
Software debug settings
"""
d = datetime.datetime.now() # Determine the starting time of the software run
print("start: " + str(d)) # Print the starting time at the start of the run
sys.setrecursionlimit(1800) # Allowable recursion depth
"""--------------------------------------------------------------------------------------------------------------------
Class that stores patents
"""
class Patent():
patent_id_counter = 0
patents_total_counts = {}
def __init__(self, input_id, input_patent_title, input_patent_original_id, input_company_id, input_attribute_counts):
self.patent_unique_id = self.new_id()
self.patent_input_id = input_id
self.patent_input_title = input_patent_title
self.patent_input_patent_original_id = input_patent_original_id
self.patent_input_company_id = input_company_id
self.patent_node_id = None
self.patent_attributes = input_attribute_counts
for patent_attribute, patent_attribute_value in self.patent_attributes.items():
if patent_attribute in self.__class__.patents_total_counts:
self.__class__.patents_total_counts[patent_attribute] = self.__class__.patents_total_counts[
patent_attribute] + patent_attribute_value
else:
self.__class__.patents_total_counts[patent_attribute] = patent_attribute_value
node.append(Node("new", self, None, None, None, None))
def new_id(self):
new_patent_id = self.__class__.patent_id_counter
self.__class__.patent_id_counter += 1
return new_patent_id
"""--------------------------------------------------------------------------------------------------------------------
Class that stores nodes
"""
class Node():
node_id_counter = 0
tree_root = None
patents_number = 0
patents_total_counts = {}
def __init__(self, node_type, new_patent_object, add_node_root, new_attributes, new_children, new_patent_count):
self.node_unique_id = self.new_id()
self.node_root = None
self.node_children = []
self.node_attributes = {}
self.node_patents = []
self.node_patents_number = 0
if node_type == "tree root":
# Make the first node into the tree root else start categorization
if self.__class__.tree_root == None:
self.__class__.tree_root = self
else:
if node_type=="add node to concept" or node_type=="merge node":
self.node_root = add_node_root
self.node_root.node_children.append(self)
if node_type=="add node to concept" or node_type=="new":
self.node_patents_number += 1
self.node_patents.append(new_patent_object)
for attr_key, attr_value in new_patent_object.patent_attributes.items():
if attr_key in self.node_attributes:
self.node_attributes[attr_key] = self.node_attributes[attr_key] + attr_value
else:
self.node_attributes[attr_key] = attr_value
if node_type == "new":
self.__class__.patents_number += 1
self.cobweb(self.__class__.tree_root)
if node_type == "merge node":
for attr_key, attr_value in new_attributes.items():
self.node_attributes[attr_key] = attr_value
self.node_children = new_children
self.node_patents_number = new_patent_count
for item in self.node_children:
item.node_root = self
def new_id(self):
new_node_id = self.__class__.node_id_counter
self.__class__.node_id_counter += 1
return new_node_id
def cobweb(self, search_root):
if not search_root.node_children:
# add this node under the given root
self.node_root = search_root
search_root.node_children.append(self)
self.add_attribute_counts(search_root, self.node_attributes)
# If the root is the tree root do nothing and add the first leaf otherwise create new node and add leaf shifting down the patent
if search_root.node_root != None:
node.append(Node("add node to concept",search_root.node_patents[0],search_root, None, None, None))
search_root.node_patents = []
else:
# check for adding to existing
temp_total = {}
temp_total_patent_number = 0
temp_count_children = 0
temp_best = None
temp_second_best = None
temp_best_cu = None
temp_second_best_cu = None
# add the counts of the root to the temporary hash
for attr_key, attr_value in search_root.node_attributes.items():
if attr_key in temp_total:
temp_total[attr_key] = temp_total[attr_key] + attr_value
else:
temp_total[attr_key] = attr_value
# add the counts of the new patent to the temporary hash
for attr_key, attr_value in self.node_attributes.items():
if attr_key in temp_total:
temp_total[attr_key] = temp_total[attr_key] + attr_value
else:
temp_total[attr_key] = attr_value
# count the number of patents in the children combined
for x in search_root.node_children:
temp_total_patent_number += x.node_patents_number
temp_count_children += 1
# add one patent as the virtual patent
temp_total_patent_number += 1
# calculate the expected number of correct guesses with no knowledge - equal for all children categories
expected_correct_guesses = 0
for attr_key, attr_value in temp_total.items():
expected_correct_guesses += (attr_value / temp_total_patent_number) * (attr_value / temp_total_patent_number)
expected_correct_guesses += (1 - (attr_value / temp_total_patent_number)) * (1 - (attr_value / temp_total_patent_number))
# calculate the expected number of attribute values that can be correctly guessed
for trybranch in search_root.node_children:
temp_branch_sum = 0
for branch in search_root.node_children:
temp_branch_counts = {}
temp_branch_patent_number = 0
temp_branch_patent_number += branch.node_patents_number
if trybranch == branch:
temp_branch_patent_number += 1
temp_branch_probability = temp_branch_patent_number / temp_total_patent_number
for attr_key, attr_value in branch.node_attributes.items():
temp_branch_counts[attr_key] = attr_value
if trybranch == branch:
for attr_key, attr_value in self.node_attributes.items():
temp_branch_counts[attr_key] += attr_value
predicted_correct_guesses = 0
for attr_key, attr_value in temp_branch_counts.items():
predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number)
predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number))
temp_branch_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses)
temp_branch_cu = temp_branch_sum / temp_count_children
if temp_best_cu == None:
temp_best_cu = temp_branch_cu
temp_best = trybranch
else:
if temp_branch_cu > temp_best_cu:
temp_second_best_cu = temp_best_cu
temp_best_cu = temp_branch_cu
temp_second_best = temp_best
temp_best = trybranch
else:
if temp_second_best_cu == None:
temp_second_best_cu = temp_branch_cu
temp_second_best = trybranch
else:
if temp_branch_cu > temp_best_cu:
temp_second_best_cu = temp_branch_cu
temp_second_best = trybranch
# check for adding new
temp_new_branch_probability = 0
temp_new_branch_probability += 1/temp_total_patent_number
temp_new_branch_count_children = temp_count_children + 1
new_predicted_correct_guesses = 0
for attr_key, attr_value in self.node_attributes.items():
new_predicted_correct_guesses += attr_value * attr_value
new_predicted_correct_guesses += (1 - attr_value) * (1 - attr_value)
temp_new_sum = 0
temp_new_sum += temp_new_branch_probability * (new_predicted_correct_guesses - expected_correct_guesses)
for branch in search_root.node_children:
temp_branch_counts = {}
temp_branch_patent_number = 0
temp_branch_patent_number += branch.node_patents_number
temp_branch_probability = temp_branch_patent_number / temp_total_patent_number
for attr_key, attr_value in branch.node_attributes.items():
temp_branch_counts[attr_key] = attr_value
predicted_correct_guesses = 0
for attr_key, attr_value in temp_branch_counts.items():
predicted_correct_guesses += (attr_value / temp_branch_patent_number) * (attr_value / temp_branch_patent_number)
predicted_correct_guesses += (1 - (attr_value / temp_branch_patent_number)) * (1 - (attr_value / temp_branch_patent_number))
temp_new_sum += temp_branch_probability * (predicted_correct_guesses - expected_correct_guesses)
temp_new_cu = temp_new_sum / temp_new_branch_count_children
# Check for merging
if temp_best != None and temp_second_best != None:
merging_possible = True
temp_merge_branch_probability = 0
temp_merge_branch_patent_number = temp_best.node_patents_number + temp_second_best.node_patents_number + 1
temp_merge_branch_probability += temp_merge_branch_patent_number/temp_total_patent_number
temp_merge_branch_count_children = temp_count_children - 1
temp_merge_sum = 0
for branch in search_root.node_children:
if branch != temp_best and branch != temp_second_best:
temp_merge_attributes = {}
for attr_key, attr_value in branch.node_attributes.items():
temp_merge_attributes[attr_key] = attr_value
merge_predicted_correct_guesses = 0
for attr_key, attr_value in temp_merge_attributes.items():
merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number)
merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number))
temp_merge_sum += (branch.node_patents_number/temp_total_patent_number) * (merge_predicted_correct_guesses - expected_correct_guesses)
temp_merge_attributes = {}
for attr_key, attr_value in temp_best.node_attributes.items():
temp_merge_attributes[attr_key] = attr_value
for attr_key, attr_value in temp_second_best.node_attributes.items():
temp_merge_attributes[attr_key] += attr_value
for attr_key, attr_value in self.node_attributes.items():
temp_merge_attributes[attr_key] += attr_value
merge_predicted_correct_guesses = 0
for attr_key, attr_value in temp_merge_attributes.items():
merge_predicted_correct_guesses += (attr_value / temp_merge_branch_patent_number) * (attr_value / temp_merge_branch_patent_number)
merge_predicted_correct_guesses += (1 - (attr_value / temp_merge_branch_patent_number)) * (1 - (attr_value / temp_merge_branch_patent_number))
temp_merge_sum += temp_merge_branch_probability * (merge_predicted_correct_guesses - expected_correct_guesses)
temp_merge_cu = temp_merge_sum / temp_merge_branch_count_children
else:
merging_possible = False
# Check for splitting
if temp_best != None and temp_best.node_patents_number > 1:
splitting_possible = True
temp_split_branch_count_children = 0
temp_split_branch_count_children += temp_count_children
temp_split_branch_count_children -= 1
for item in temp_best.node_children:
temp_split_branch_count_children += 1
temp_split_best = None
temp_split_best_cu = None
temp_split_store_sum = 0
for branch in search_root.node_children:
if branch != temp_best:
temp_split_branch_patent_number = branch.node_patents_number
temp_split_attributes = {}
for attr_key, attr_value in branch.node_attributes.items():
temp_split_attributes[attr_key] = attr_value
split_predicted_correct_guesses = 0
for attr_key, attr_value in temp_split_attributes.items():
split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number)
split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number))
temp_split_branch_probability = 0
temp_split_branch_probability += branch.node_patents_number/temp_total_patent_number
temp_split_store_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses)
for trybranch in temp_best.node_children:
temp_split_sum = 0
for branch in temp_best.node_children:
temp_split_branch_patent_number = branch.node_patents_number
temp_split_attributes = {}
for attr_key, attr_value in branch.node_attributes.items():
temp_split_attributes[attr_key] = attr_value
if branch == trybranch:
temp_split_branch_patent_number += 1
for attr_key, attr_value in self.node_attributes.items():
temp_split_attributes[attr_key] += attr_value
split_predicted_correct_guesses = 0
for attr_key, attr_value in temp_split_attributes.items():
split_predicted_correct_guesses += (attr_value / temp_split_branch_patent_number) * (attr_value / temp_split_branch_patent_number)
split_predicted_correct_guesses += (1 - (attr_value / temp_split_branch_patent_number)) * (1 - (attr_value / temp_split_branch_patent_number))
temp_split_branch_patent_number = branch.node_patents_number
if branch == trybranch:
temp_split_branch_patent_number += 1
temp_split_branch_probability = 0
temp_split_branch_probability += temp_split_branch_patent_number/temp_total_patent_number
temp_split_sum += temp_split_branch_probability * (split_predicted_correct_guesses - expected_correct_guesses)
temp_split_cu = (temp_split_store_sum + temp_split_sum) / temp_split_branch_count_children
if temp_split_best == None:
temp_split_best = trybranch
temp_split_best_cu = temp_split_cu
else:
if temp_split_cu > temp_split_best_cu:
temp_split_best = trybranch
temp_split_best_cu = temp_split_cu
else:
splitting_possible = False
# Checks for best possible CU
max_cu_value = temp_best_cu
max_cu = "child"
if (temp_new_cu >= max_cu_value) and (operator_new == True):
max_cu_value = temp_new_cu
max_cu = "new"
if merging_possible and (temp_merge_cu > max_cu_value) and (operator_merge == True):
max_cu_value = temp_merge_cu
max_cu = "merge"
if splitting_possible and (temp_split_best_cu > max_cu_value) and (operator_split == True):
max_cu_value = temp_split_best_cu
max_cu = "split"
# Check if adding this node as a new category is better than putting it under an existing category
if max_cu == "new":
self.node_root = search_root
search_root.node_children.append(self)
self.add_attribute_counts(search_root, self.node_attributes)
# Check if adding this node as a new category is better than putting it under an existing category
elif max_cu == "merge":
merge_patent_count = temp_best.node_patents_number + temp_second_best.node_patents_number
merge_attributes = {}
for attr_key, attr_value in temp_best.node_attributes.items():
merge_attributes[attr_key] = attr_value
for attr_key, attr_value in temp_second_best.node_attributes.items():
merge_attributes[attr_key] += attr_value
merge_children = []
merge_children.append(temp_best)
merge_children.append(temp_second_best)
node.append(Node("merge node", None, search_root, merge_attributes, merge_children, merge_patent_count))
search_root.node_children.remove(temp_best)
search_root.node_children.remove(temp_second_best)
self.cobweb(temp_best)
# If splitting is the best option
elif max_cu == "split":
for item in temp_best.node_children:
search_root.node_children.append(item)
item.node_root = search_root
search_root.node_children.remove(temp_best)
self.cobweb(search_root)
# Go down to most fitting branch
elif max_cu == "child":
self.cobweb(temp_best)
# Catching problem
else:
print("This should never happen!")
def add_attribute_counts(self, start_node, count_hash):
start_node.node_patents_number += 1
for attr_key, attr_value in count_hash.items():
if attr_key in start_node.node_attributes:
start_node.node_attributes[attr_key] = start_node.node_attributes[attr_key] + attr_value
else:
start_node.node_attributes[attr_key] = attr_value
if start_node.node_root != None:
self.add_attribute_counts(start_node.node_root, count_hash)
def create_json_structure(self):
output_json = self.loop_items(self)
return output_json
def loop_items(self, from_this_node):
output_node = {}
# Check if from_this_node is a leaf
if not from_this_node.node_children:
output_node["patent"] = from_this_node.node_patents[0].patent_input_id
output_node["patent_original_id"] = from_this_node.node_patents[0].patent_input_patent_original_id
output_node["patent_title"] = from_this_node.node_patents[0].patent_input_title
output_node["company"] = from_this_node.node_patents[0].patent_input_company_id
# If it is not a leaf run loop_items for all children
else:
kids = []
for item in from_this_node.node_children:
# This nested loop will explore all the branches to the end
kids.append(self.loop_items(item))
output_node["children"] = kids
# Don't do for the tree root as it has all at 100%
if from_this_node != self.__class__.tree_root:
# Define the category characteristics for display
category_attributes = ""
category_attributes += "patents: " + str(from_this_node.node_patents_number) + " / "
temp_ratio_store = {}
for attr_key, attr_value in from_this_node.node_attributes.items():
if displayType == True:
temp_ratio = from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key]
else:
temp_ratio = from_this_node.node_attributes[attr_key] / from_this_node.node_patents_number
if round(temp_ratio,3) > 0:
temp_ratio_store[attr_key] = (round(temp_ratio,3)*100)
temp_ratio_count = 0
temp_ratio_store_ordered = OrderedDict(sorted(temp_ratio_store.items(), key=lambda kv: kv[1], reverse=True))
output_node["name"] = ""
for attr_key, attr_value in temp_ratio_store_ordered.items():
category_attributes += str(attr_key) + "(" + str(round(attr_value,1)) + "% / P " + str(round((from_this_node.node_attributes[attr_key]/from_this_node.node_patents_number),2)) + "), "
temp_ratio_count += 1
if temp_ratio_count >= display_ratio_number:
break
temp_ratio_count = 0
if displayAttributes == True:
# Parse the attributes on ascending relevancy
for attr_key, attr_value in temp_ratio_store_ordered.items():
found = False
# Dont show if only one patent has this attribute
if from_this_node.node_root.node_attributes[attr_key] > 1:
# Parse children to calculate their Carry Through Ratio and compare it with the accepted Carry Through Ratio
for child in from_this_node.node_children:
if displayType == True:
CarryThroughRatio = (100*round((child.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3)) / attr_value
else:
CarryThroughRatio = child.node_attributes[attr_key] / from_this_node.node_attributes[attr_key] #(100*round((child.node_attributes[attr_key]/child.node_patents_number),3)) / attr_value
if CarryThroughRatio >= CarryThroughThreshold:
found = True
# Check its parent
if displayType == True:
TraceBackAlpha = attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/self.__class__.tree_root.node_attributes[attr_key]),3))
else:
TraceBackAlpha = from_this_node.node_attributes[attr_key] / from_this_node.node_root.node_attributes[attr_key] #attr_value / (100*round((from_this_node.node_root.node_attributes[attr_key]/from_this_node.node_root.node_patents_number),3))
TraceBackBeta = from_this_node.node_patents_number / from_this_node.node_root.node_patents_number
if (TraceBackAlpha/TraceBackBeta) <= TraceBackTreshold:
found = True
if found == False:
if displayType == True:
display_intensity_ratio = attr_value
else:
display_intensity_ratio = 100*round((from_this_node.node_attributes[attr_key] / self.__class__.tree_root.node_attributes[attr_key]),3)
if temp_ratio_count > 0:
output_node["name"] += " - "
if display_intensity_ratio >= 75:
output_node["name"] += "" + str(attr_key) + ""
if display_intensity_ratio < 75 and display_intensity_ratio >= 50:
output_node["name"] += "" + str(attr_key) + ""
if display_intensity_ratio < 50 and display_intensity_ratio >= 25:
output_node["name"] += "" + str(attr_key) + ""
if display_intensity_ratio < 25 and display_intensity_ratio >= 0:
output_node["name"] += "" + str(attr_key) + ""
temp_ratio_count += 1
if temp_ratio_count >= display_ratio_titles:
break
output_node["description"] = category_attributes
return output_node
"""--------------------------------------------------------------------------------------------------------------------
Setup
"""
# Create lists for patents and nodes
patent = []
node = []
#Create tree root node
node.append(Node("tree root", None, None, None, None, None))
"""--------------------------------------------------------------------------------------------------------------------
Import and iterate over de inputs
"""
#Import the data from the json file
data_file_input = open(inputfile).read()
input_data = json.loads(data_file_input)
print("Data imported from JSON: " + str(datetime.datetime.now()))
#Iterate over the patent entries
for item in input_data['Patents']:
patent_id = item.get('id')
patent_title = item.get('patent_title')
patent_original_id = item.get('input_id')
patent_company_id = item.get('company_id')
patent_attributes = item['Attributes'][0]
patent.append(Patent(patent_id, patent_title, patent_original_id, patent_company_id, patent_attributes))
print("Patents added to tree: " + str(datetime.datetime.now()))
"""--------------------------------------------------------------------------------------------------------------------
Write to JSON for visualization
"""
print("Start converting tree to output JSON: " + str(datetime.datetime.now()))
data_file_output = open(outputfile,mode='w+')
output_hash = node[0].create_json_structure()
print("Output JSON created: " + str(datetime.datetime.now()))
json.dump(output_hash,data_file_output,indent=2,sort_keys=True)
print("End: " + str(datetime.datetime.now()))