__author__ = 'Kevin Kruijthoff'

"""--------------------------------------------------------------------------------------------------------------------
Author:  Kevin Kruijthoff
Date:    23 Jul 2015
File:    ParseIndex_v1.py
Version: 1.0

Includes:
- Parsing patents from Derwent patent database in row structured format
- Filter unwanted word-type words
- Filter non-English words
- Filter stopwords
- Stemming of words
- Determine attribute set
- Index the patents with boolean attributes

Manual: Settings are explained in code, for complete documentation see Thesis appendix
--------------------------------------------------------------------------------------------------------------------"""

"""--------------------------------------------------------------------------------------------------------------------
Software user defined settings
"""
inputfile = 'The_name_of_the_patent_database_file.txt'   # The file that contains the patents
outputfile = 'Name_to_give_the_to_be_created_file.json'  # JSON file to store the indexed patents, it will be created if it does not exist
number_of_attributes = 750   # Set the top n of attributes to include from the count list
include_title = False        # Include the patent title field in attributizing
include_abstract = False     # Include the patent abstract field in attributizing
include_nov = True           # Include the patent claim field in attributizing
include_nouns = True         # Allow nouns (e.g. advice, team, violin) to be included as attributes
include_verbs = True         # Allow verbs (e.g. walking, wrote, eat) to be included as attributes
include_adjectives = True    # Allow adjectives (e.g. melted, flat, small) to be included as attributes
include_adverbs = True       # Allow adverbs (e.g. abruptly, outside, randomly) to be included as attributes

"""--------------------------------------------------------------------------------------------------------------------
Imports and settings
"""
# Imports
import itertools
import json
import re
import sys
from collections import OrderedDict
import datetime

print("start: " + str(datetime.datetime.now()))
now = datetime.datetime.now()

# text processing related
import nltk
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Penn-Treebank part-of-speech tags that are allowed to become attributes,
# assembled from the user settings above.
partofspeech = []
if include_nouns:
    partofspeech.extend(["NN", "NNS", "NNP", "NNPS"])
if include_verbs:
    partofspeech.extend(["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"])
if include_adjectives:
    partofspeech.extend(["JJ", "JJR", "JJS"])
if include_adverbs:
    partofspeech.extend(["RB", "RBR", "RBS"])

"""--------------------------------------------------------------------------------------------------------------------
Patent object
"""
class Patent:
    """One parsed patent record.

    Besides holding the per-patent data, construction folds this patent's
    word counts into the class-wide ``patents_total_counts`` tally, which is
    later used to pick the global top-N attribute set.
    """

    patent_id_counter = 0           # next unique id to hand out
    patents_total_counts = {}       # word -> total count over ALL patents
    patents_total_counts_ordered = {}  # filled in later, sorted by count desc

    def __init__(self, input_id, input_title, input_company_id, input_attribute_counts):
        self.patent_unique_id = self.new_id()
        self.patent_input_id = input_id
        self.patent_input_title = input_title
        self.patent_input_company_id = input_company_id
        self.patent_attributes = input_attribute_counts
        # Accumulate this patent's counts into the class-wide totals.
        totals = self.__class__.patents_total_counts
        for patent_attribute, patent_attribute_value in self.patent_attributes.items():
            totals[patent_attribute] = totals.get(patent_attribute, 0) + patent_attribute_value

    def new_id(self):
        """Return a fresh sequential id (class-wide counter)."""
        new_patent_id = self.__class__.patent_id_counter
        self.__class__.patent_id_counter += 1
        return new_patent_id

"""--------------------------------------------------------------------------------------------------------------------
Import the stopwords
"""
# Merged stopword collection for the three languages that occur in the data.
# Using a set gives O(1) membership tests (the original used a list with
# .count(), which is O(n) per word).
listofstopwords = set()
for _stopword_language in ('english', 'french', 'german'):
    listofstopwords.update(stopwords.words(_stopword_language))

"""--------------------------------------------------------------------------------------------------------------------
Import and parse data
"""
# Matches the leading Derwent field tag (e.g. "pn", "abd", "prd$5l") plus the
# " - " separator when present, so it can be removed to leave the field value.
# BUG FIX: the original used str.lstrip("pn - ") etc., which strips any
# leading CHARACTERS from that set rather than the prefix string — a value
# starting with one of those letters (e.g. a title "Implantable ...") was
# silently mangled.
_FIELD_PREFIX = re.compile(r'^[a-z0-9$]+\s*(?:-\s*)?')


def _field_value(line):
    """Return the payload of a tagged line with tag and separator removed.

    Trailing whitespace/newline is stripped.  Assumes tags are lower-case
    alphanumerics (optionally containing '$') followed by an optional dash
    separator — TODO confirm against the exact Derwent export spacing.
    """
    return _FIELD_PREFIX.sub('', line).rstrip()


# Hoisted out of the parse loop: the original built a new SnowballStemmer for
# every single verb occurrence.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

patent = []        # accepted Patent objects, in file order
countexclude = 0   # patents skipped because no enabled field contained text
hastid = hasabd = hasnov = False   # did the current record have a non-empty tid/abd/nov field?
attributes = {}    # word -> count for the record currently being read
# Robustness: keep these defined even if a record is missing a field.
patent_name = patent_title = patent_company = ""

# NOTE(review): the tag literals below are tag + two spaces, which is the only
# spacing consistent with the slice lengths (2+2=4, 3+2=5, 6+2=8) — confirm
# against a real export file.
with open(inputfile, 'r', encoding='utf-8') as raw:
    for line in raw:
        tag4 = line[0:4]
        tag5 = line[0:5]
        if tag4 == "pn  ":
            # A new record starts here: reset the per-patent accumulator.
            # BUG FIX: the original reset `attributes` on every enabled field
            # line instead, so only the LAST enabled field of a patent was
            # counted, and a patent without enabled fields reused the
            # previous patent's counts.
            patent_name = _field_value(line)
            attributes = {}
        elif tag4 == "ti  ":
            patent_title = _field_value(line)
        elif (tag5 == "abd  " and include_abstract) or \
             (tag5 == "tid  " and include_title) or \
             (tag5 == "nov  " and include_nov):
            columnfield = _field_value(line).lower()
            if tag5 == "tid  " and columnfield:
                hastid = True
            if tag5 == "abd  " and columnfield:
                hasabd = True
            if tag5 == "nov  " and columnfield:
                hasnov = True
            # Tokenize the field and determine the part-of-speech type of the
            # individual words, then count the words that survive all filters.
            for item, itemtype in nltk.pos_tag(nltk.word_tokenize(columnfield)):
                if len(item) <= 1:
                    continue  # single characters are never useful attributes
                if itemtype not in partofspeech:
                    continue  # word type not enabled in the settings
                if not wn.synsets(item, lang='eng'):
                    continue  # not in the English WordNet dictionary
                if item in listofstopwords:
                    continue  # remove stopwords may they still be present
                if itemtype[0:2] == "VB":
                    # Stem verbs so inflections collapse onto one attribute.
                    item = stemmer.stem(item)
                attributes[item] = attributes.get(item, 0) + 1
        elif tag4 == "pa  ":
            # Company field may list several assignees separated by ' | ';
            # keep only the first one.
            patent_company = _field_value(line).split(sep=' | ')[0]
        elif line[0:8] == "prd$5l  ":
            # End-of-record marker: create the Patent object if at least one
            # enabled field contained text, otherwise report the exclusion.
            if (include_title and hastid) or \
               (include_abstract and hasabd) or \
               (include_nov and hasnov):
                patent.append(Patent(patent_name, patent_title, patent_company, attributes))
            else:
                countexclude += 1
                print(str(countexclude) + " - " + str(patent_name) + " - excluded")
            # BUG FIX: the original only reset these flags on the include
            # branch; reset them unconditionally so no state leaks between
            # records.
            hastid = hasabd = hasnov = False

"""--------------------------------------------------------------------------------------------------------------------
Order all attributes and select the top X
"""
Patent.patents_total_counts_ordered = OrderedDict(
    sorted(Patent.patents_total_counts.items(), key=lambda kv: kv[1], reverse=True))

# Keep only the `number_of_attributes` most frequent words; insertion order of
# the source dict is already count-descending, so slicing preserves it.
total_counts_ordered = dict(
    itertools.islice(Patent.patents_total_counts_ordered.items(), number_of_attributes))

print("----------------------------------")
print(str(number_of_attributes) + " most occurring words")
print("----------------------------------")
for attr_key, attr_val in total_counts_ordered.items():
    print('"' + attr_key + '" : ' + str(attr_val))

"""--------------------------------------------------------------------------------------------------------------------
Make ready for export
"""
data_hash_patents = []
for item in patent:
    # Boolean (0/1) presence vector of this patent over the selected
    # attribute set, in the attribute set's (count-descending) order.
    local_attributes = {
        attr_key: int(attr_key in item.patent_attributes)
        for attr_key in total_counts_ordered
    }
    data_hash_patents.append({
        "id": item.patent_unique_id,
        "patent_title": item.patent_input_title,
        "input_id": item.patent_input_id,
        "company_id": item.patent_input_company_id,
        "Attributes": [local_attributes],
    })

# Metadata block recording who/when/what settings produced this index.
hash_meta = {
    "author": __author__,
    "date": str(now.day) + "-" + str(now.month) + "-" + str(now.year),
    "time": str(now.hour) + ":" + str(now.minute),
    "inputfile": inputfile,
    "includetitle": include_title,
    "includeabstract": include_abstract,
    "numberofattributes": number_of_attributes,
    "include_nouns": include_nouns,
    "include_verbs": include_verbs,
    "include_adjectives": include_adjectives,
    "include_adverbs": include_adverbs,
}

data_hash = {"Metadata": [hash_meta], "Patents": data_hash_patents}

# BUG FIX: the original opened the output file without ever closing it; the
# context manager guarantees the JSON is flushed to disk.
with open(outputfile, mode='w+', encoding='utf-8') as data_file_output:
    json.dump(data_hash, data_file_output, indent=2, sort_keys=False)

print("end: " + str(datetime.datetime.now()))