__author__ = 'Kevin Kruijthoff'

"""--------------------------------------------------------------------------------------------------------------------
Author:  Kevin Kruijthoff
Date:    23 Jul 2015
File:    ParseIndex_v1.py
Version: 1.0

Includes:
- Parsing patents from Derwent patent database in row structured format
- Filter unwanted word-type words
- Filter non-English words
- Filter stopwords
- Stemming of words
- Determine attribute set
- Index the patents with boolean attributes

Manual: Settings are explained in code, for complete documentation see Thesis appendix
--------------------------------------------------------------------------------------------------------------------"""

"""--------------------------------------------------------------------------------------------------------------------
Software user defined settings
"""
inputfile = 'The_name_of_the_patent_database_file.txt'   # The file that contains the patents
outputfile = 'Name_to_give_the_to_be_created_file.json'  # JSON file to store the indexed patents, it will be created if it does not exist
number_of_attributes = 750   # Set the top n of attributes to include from the count list
include_title = False        # Include the patent title field in attributizing
include_abstract = False     # Include the patent abstract field in attributizing
include_nov = True           # Include the patent claim field in attributizing
include_nouns = True         # Allow nouns (e.g. advice, team, violin) to be included as attributes
include_verbs = True         # Allow verbs (e.g. walking, wrote, eat) to be included as attributes
include_adjectives = True    # Allow adjectives (e.g. melted, flat, small) to be included as attributes
include_adverbs = True       # Allow adverbs (e.g. abruptly, outside, randomly) to be included as attributes

"""--------------------------------------------------------------------------------------------------------------------
Imports and settings
"""
# Imports
import itertools
import json
import re
import sys
from collections import OrderedDict
import datetime

print("start: " + str(datetime.datetime.now()))
now = datetime.datetime.now()

# text processing related
import nltk
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Penn-Treebank part-of-speech tags that are allowed to become attributes,
# assembled from the user settings above.
partofspeech = []
if include_nouns:
    partofspeech.extend(["NN", "NNS", "NNP", "NNPS"])
if include_verbs:
    partofspeech.extend(["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"])
if include_adjectives:
    partofspeech.extend(["JJ", "JJR", "JJS"])
if include_adverbs:
    partofspeech.extend(["RB", "RBR", "RBS"])

"""--------------------------------------------------------------------------------------------------------------------
Patent object
"""
class Patent:
    """One parsed patent record.

    Besides holding the per-patent data, construction folds this patent's
    word counts into the class-wide ``patents_total_counts`` tally, which is
    later used to pick the global top-N attribute set.
    """

    patent_id_counter = 0           # next unique id to hand out
    patents_total_counts = {}       # word -> total count over ALL patents
    patents_total_counts_ordered = {}  # filled in later, sorted by count desc

    def __init__(self, input_id, input_title, input_company_id, input_attribute_counts):
        self.patent_unique_id = self.new_id()
        self.patent_input_id = input_id
        self.patent_input_title = input_title
        self.patent_input_company_id = input_company_id
        self.patent_attributes = input_attribute_counts
        # Accumulate this patent's counts into the class-wide totals.
        totals = self.__class__.patents_total_counts
        for patent_attribute, patent_attribute_value in self.patent_attributes.items():
            totals[patent_attribute] = totals.get(patent_attribute, 0) + patent_attribute_value

    def new_id(self):
        """Return a fresh sequential id (class-wide counter)."""
        new_patent_id = self.__class__.patent_id_counter
        self.__class__.patent_id_counter += 1
        return new_patent_id

"""--------------------------------------------------------------------------------------------------------------------
Import the stopwords
"""
# Merged stopword collection for the three languages that occur in the data.
# Using a set gives O(1) membership tests (the original used a list with
# .count(), which is O(n) per word).
listofstopwords = set()
for _stopword_language in ('english', 'french', 'german'):
    listofstopwords.update(stopwords.words(_stopword_language))

"""--------------------------------------------------------------------------------------------------------------------
Import and parse data
"""
# Matches the leading Derwent field tag (e.g. "pn", "abd", "prd$5l") plus the
# " - " separator when present, so it can be removed to leave the field value.
# BUG FIX: the original used str.lstrip("pn - ") etc., which strips any
# leading CHARACTERS from that set rather than the prefix string — a value
# starting with one of those letters (e.g. a title "Implantable ...") was
# silently mangled.
_FIELD_PREFIX = re.compile(r'^[a-z0-9$]+\s*(?:-\s*)?')


def _field_value(line):
    """Return the payload of a tagged line with tag and separator removed.

    Trailing whitespace/newline is stripped.  Assumes tags are lower-case
    alphanumerics (optionally containing '$') followed by an optional dash
    separator — TODO confirm against the exact Derwent export spacing.
    """
    return _FIELD_PREFIX.sub('', line).rstrip()


# Hoisted out of the parse loop: the original built a new SnowballStemmer for
# every single verb occurrence.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

patent = []        # accepted Patent objects, in file order
countexclude = 0   # patents skipped because no enabled field contained text
hastid = hasabd = hasnov = False   # did the current record have a non-empty tid/abd/nov field?
attributes = {}    # word -> count for the record currently being read
# Robustness: keep these defined even if a record is missing a field.
patent_name = patent_title = patent_company = ""

# NOTE(review): the tag literals below are tag + two spaces, which is the only
# spacing consistent with the slice lengths (2+2=4, 3+2=5, 6+2=8) — confirm
# against a real export file.
with open(inputfile, 'r', encoding='utf-8') as raw:
    for line in raw:
        tag4 = line[0:4]
        tag5 = line[0:5]
        if tag4 == "pn  ":
            # A new record starts here: reset the per-patent accumulator.
            # BUG FIX: the original reset `attributes` on every enabled field
            # line instead, so only the LAST enabled field of a patent was
            # counted, and a patent without enabled fields reused the
            # previous patent's counts.
            patent_name = _field_value(line)
            attributes = {}
        elif tag4 == "ti  ":
            patent_title = _field_value(line)
        elif (tag5 == "abd  " and include_abstract) or \
             (tag5 == "tid  " and include_title) or \
             (tag5 == "nov  " and include_nov):
            columnfield = _field_value(line).lower()
            if tag5 == "tid  " and columnfield:
                hastid = True
            if tag5 == "abd  " and columnfield:
                hasabd = True
            if tag5 == "nov  " and columnfield:
                hasnov = True
            # Tokenize the field and determine the part-of-speech type of the
            # individual words, then count the words that survive all filters.
            for item, itemtype in nltk.pos_tag(nltk.word_tokenize(columnfield)):
                if len(item) <= 1:
                    continue  # single characters are never useful attributes
                if itemtype not in partofspeech:
                    continue  # word type not enabled in the settings
                if not wn.synsets(item, lang='eng'):
                    continue  # not in the English WordNet dictionary
                if item in listofstopwords:
                    continue  # remove stopwords may they still be present
                if itemtype[0:2] == "VB":
                    # Stem verbs so inflections collapse onto one attribute.
                    item = stemmer.stem(item)
                attributes[item] = attributes.get(item, 0) + 1
        elif tag4 == "pa  ":
            # Company field may list several assignees separated by ' | ';
            # keep only the first one.
            patent_company = _field_value(line).split(sep=' | ')[0]
        elif line[0:8] == "prd$5l  ":
            # End-of-record marker: create the Patent object if at least one
            # enabled field contained text, otherwise report the exclusion.
            if (include_title and hastid) or \
               (include_abstract and hasabd) or \
               (include_nov and hasnov):
                patent.append(Patent(patent_name, patent_title, patent_company, attributes))
            else:
                countexclude += 1
                print(str(countexclude) + " - " + str(patent_name) + " - excluded")
            # BUG FIX: the original only reset these flags on the include
            # branch; reset them unconditionally so no state leaks between
            # records.
            hastid = hasabd = hasnov = False

"""--------------------------------------------------------------------------------------------------------------------
Order all attributes and select the top X
"""
Patent.patents_total_counts_ordered = OrderedDict(
    sorted(Patent.patents_total_counts.items(), key=lambda kv: kv[1], reverse=True))

# Keep only the `number_of_attributes` most frequent words; insertion order of
# the source dict is already count-descending, so slicing preserves it.
total_counts_ordered = dict(
    itertools.islice(Patent.patents_total_counts_ordered.items(), number_of_attributes))

print("----------------------------------")
print(str(number_of_attributes) + " most occurring words")
print("----------------------------------")
for attr_key, attr_val in total_counts_ordered.items():
    print('"' + attr_key + '" : ' + str(attr_val))

"""--------------------------------------------------------------------------------------------------------------------
Make ready for export
"""
data_hash_patents = []
for item in patent:
    # Boolean (0/1) presence vector of this patent over the selected
    # attribute set, in the attribute set's (count-descending) order.
    local_attributes = {
        attr_key: int(attr_key in item.patent_attributes)
        for attr_key in total_counts_ordered
    }
    data_hash_patents.append({
        "id": item.patent_unique_id,
        "patent_title": item.patent_input_title,
        "input_id": item.patent_input_id,
        "company_id": item.patent_input_company_id,
        "Attributes": [local_attributes],
    })

# Metadata block recording who/when/what settings produced this index.
hash_meta = {
    "author": __author__,
    "date": str(now.day) + "-" + str(now.month) + "-" + str(now.year),
    "time": str(now.hour) + ":" + str(now.minute),
    "inputfile": inputfile,
    "includetitle": include_title,
    "includeabstract": include_abstract,
    "numberofattributes": number_of_attributes,
    "include_nouns": include_nouns,
    "include_verbs": include_verbs,
    "include_adjectives": include_adjectives,
    "include_adverbs": include_adverbs,
}

data_hash = {"Metadata": [hash_meta], "Patents": data_hash_patents}

# BUG FIX: the original opened the output file without ever closing it; the
# context manager guarantees the JSON is flushed to disk.
with open(outputfile, mode='w+', encoding='utf-8') as data_file_output:
    json.dump(data_hash, data_file_output, indent=2, sort_keys=False)

print("end: " + str(datetime.datetime.now()))