# Source code for secScraper.parser

import re
import numpy as np
import copy


class stage_2_parser():
    """
    Parser object. Acts on Stage 1 data.

    Splits the raw lower-cased text of a 10-Q or 10-K filing into its
    "Item" sections by regex-matching the section titles.
    """

    def __init__(self, s):
        # s: settings dict. The parse() method reads the keys
        # 'sections_to_parse_10q' and 'sections_to_parse_10k' from it —
        # TODO confirm full schema against the caller.
        self.s = s

    def parse(self, parsed_report, verbose=False):
        """
        Parse the text in a report. The text of each section will be placed in a different dict key.

        :param parsed_report: dict with the raw text under 'input' and metadata (incl. 'type') under '0'
        :param verbose: Increase the amount of printing to the terminal
        :return: dict containing the parsed report with all the text by section. Metadata is in '0'
        :raises ValueError: if a matched title cannot be allocated to a section, if a
            start/stop pair is invalid, or if the report type has no stage 2 parser.
        """
        text = parsed_report['input']
        text = text.lower()  # all title regexes below are lower-case
        finds = []
        if parsed_report['0']['type'] == '10-Q':
            # 1. Setup the giant regex to use to parse all potential sections in the report
            # 1.1. List of all possible titles (key encodes part + item number, e.g. 'ii_1a')
            titles = {
                '_i_1': 'financial statements',
                '_i_2': 'management s discussion and analysis of financial condition and results of operations',
                '_i_3': 'quantitative and qualitative disclosures about market risk',
                '_i_4': 'controls and procedures',
                'ii_1': 'legal proceedings',
                'ii_1a': 'risk factors',
                'ii_2': 'unregistered sales of equity securities and use of proceeds',
                'ii_3': 'defaults upon senior securities',
                'ii_4': 'mine safety disclosures',
                'ii_5': 'other information',
                'ii_6': 'exhibits'
            }
            all_sections_10q = list(titles.keys())

            # 1.2. Create the regex
            pattern = []
            for key in titles:
                # Need to parse all potential sections in case they are present.
                prefix = r'([\n\r] ?| {2,})'  # Is {3,} better?
                suffix = r'(?![a-z0-9\[\]\(\)])[\.\- ][ \n]*'
                # Because the item numbers are not unique, we NEED to include at least
                # one word of the title. key[3:] strips the part prefix ('_i_'/'ii_').
                regex = r'{}item {}{}{}'.format(prefix, key[3:], suffix, titles[key].split()[0])
                # print(regex)
                pattern.append(regex)
            pattern = r'|'.join(pattern)
            pattern = re.compile(pattern)

            # 1.3. Apply the regex, single left to right pass
            res = {section: [] for section in titles}  # Will contain all the parsed data
            for m in re.finditer(pattern, text):  # All the magic happens here!
                last_word = re.findall(r'\w+$', m.group())  # Used to backcalculate the section.
                # Sanity checks
                if len(last_word) > 1:
                    print(last_word)
                    raise ValueError('[ERROR] There should not be more than one set of numbers in the matched title.')
                elif len(last_word) == 0:
                    print(last_word)
                    raise ValueError('[ERROR] This match could not be allocated')
                # Find the corresponding section: map the captured first title word back to its key
                corresponding_section = 0
                for k, v in titles.items():
                    if v.split()[0] == last_word[0]:
                        corresponding_section = k
                        break
                if corresponding_section == 0:
                    print(last_word)
                    raise ValueError("[ERROR] Could not find where |{}| goes".format(last_word[0]))
                else:
                    res[corresponding_section].append(m.span())

            # II. Now we get serious. Purge the ToC if it exists
            # verbose=True
            # print(res)
            if verbose:
                print("[INFO] Before removing the toc:", res)
            original_res = copy.deepcopy(res)  # kept only for error reporting below
            res = {k: v for k, v in res.items() if len(v)}
            # print(list(res.keys()))
            # Extract the Table of Content, if any.
            # The gist is that if it exists, all the populated keys should follow each other in order
            # print(all_sections_10q)
            # Remove the sections that are empty so we can iterate over non-zero sections
            # Hypothesis: 1a is not optional - financial statements should not be
            # if I.1. has two entries and the second is after the 1st last entry -> toc!
            # then rm all [0] entries, then re-delete all zero entries
            # else no toc and do nothing
            full_sect = list(res.keys())
            # Make sure you got something. If that is not the case, might just be a completely different template.
            try:
                assert len(full_sect)
            except:
                print("[ERROR] Here is full_sect: |{}|".format(full_sect))
                print("[ERROR] Original res:", original_res)
                raise
            if len(res[full_sect[0]]) >= 2:
                # ToC heuristic: the last populated section's first hit comes before the
                # first section's second hit -> the first hits are the ToC entries.
                if res[full_sect[-1]][0][1] < res[full_sect[0]][1][0]:
                    # There is a toc!
                    # print("[INFO] Found a ToC!")
                    for v in res.values():
                        del v[0]  # Remove all first sections
                    res = {k: v for k, v in res.items() if len(v)}
                else:
                    # print("[INFO] No ToC found")
                    pass

            # Extra step: make sure the first elements go in increasing order.
            try:
                res = clean_first_markers(res)
            except Exception as e:
                print('[ERROR] {} in parser.clean_first_markers (10-Q)'.format(e))
                print("This is the res\n", res)
                raise
            if verbose:
                print("[INFO] After removing the toc:", res)
            finds = [len(value) for value in res.values()]

            # Shrink the list of sections to review
            all_sections_10q = [k for k in all_sections_10q if k in res.keys()]

            # Extract the text for all the sections that we identified
            previous_start = 0
            for idx in range(len(all_sections_10q)-1):
                if all_sections_10q[idx] in self.s['sections_to_parse_10q']:  # Did we request to parse this section?
                    start = 0  # used for the data extraction
                    stop = 0
                    for span in res[all_sections_10q[idx]]:  # Go through all the titles found, in order
                        if span[1] > previous_start:  # found a starting point
                            start = span[1]
                            for span_next in res[all_sections_10q[idx+1]]:  # Same
                                if span_next[0] > start:
                                    stop = span_next[0]
                                    break  # Found a stopping point
                                else:
                                    # NOTE(review): deleting index `idx` of the *next* section's
                                    # span list while iterating it looks suspicious — it likely
                                    # should be index 0 (discard a stale/ToC marker). Confirm.
                                    del res[all_sections_10q[idx+1]][idx]
                            break  # Found a starting point but not necessarily a stopping point!
                    if start and stop:
                        # assert stop > start
                        parsed_report[all_sections_10q[idx]] = text[start:stop]
                    else:
                        raise ValueError('This start {} and stop {} combination is invalid for 10-Q section {}'
                                         .format(start, stop, all_sections_10q[idx]))
                    previous_start = stop

            # Backward pass: if there are some sections that were expected and did not get populated
            # we populate them with a small statement.
            for section in self.s['sections_to_parse_10q']:
                try:
                    assert len(parsed_report[section]) > 0
                except KeyError:
                    # print("[WARNING] Section {} was found to be empty.".format(section))
                    parsed_report[section] = "Nothing found for this section."
                except AssertionError:
                    raise AssertionError("[ERROR] Why is that section filled with an empty text?")
                except:
                    raise

            # Delete the input we used and return the result
            del parsed_report['input']

            # DEBUG: dead code kept disabled on purpose — dumps each parsed section to disk.
            if 0:
                with open('/home/alex/test_10-q_{}.txt'.format(np.random.randint(1000)), 'w') as f:
                    for section in parsed_report.keys():
                        if section != '0':
                            f.write(section+'\n')
                            f.write(parsed_report[section])
                            f.write('\n================================================================\n')
                            f.write('\n==========================NEW SECTION===========================\n')
                            f.write('\n================================================================\n')

            # NOTE(review): leftover code — `regex` here is the last pattern built in
            # the loop above; this extra findall only feeds the verbose sanity print.
            res = re.findall(regex, text)
            finds.append(len(res))
            # print(finds)
        elif parsed_report['0']['type'] == '10-K':
            # 1. Setup the giant regex to use to parse all potential sections in the report
            # 1.1. List of all possible titles (keys are the 10-K item numbers)
            titles = {
                '1': 'business',
                '1a': 'risk factors',
                '1b': 'unresolved staff comments',
                '2': 'properties',
                '3': 'legal proceedings',
                '4': 'submission of matters to a vote of security holders',
                '5': 'market for registrant s common equity, related stockholder matters and issuer purchases of equity securities',
                '6': 'selected financial data',
                '7': 'management s discussion and analysis of financial condition and results of operations',
                '7a': 'quantitative and qualitative disclosures about market risk',
                '8': 'financial statements and supplementary data',
                '9': 'changes in and disagreements with accountants on accounting and financial disclosure',
                '9a': 'controls and procedures',
                '9b': 'other information',
                '10': 'directors executive officers and corporate governance',
                '11': 'executive compensation',
                '12': 'security ownership of certain beneficial owners and management and related stockholder matters',
                '13': 'certain relationships and related transactions, and director independence',
                '14': 'principal account(ant|ing) fees and services',
                '15': 'exhibits financial statement schedules'
            }
            all_sections_10k = list(titles.keys())

            # 1.2. Create the regex
            pattern = []
            for key in titles:
                prefix = r'([\n\r] ?| {2,})'  # Is {3,} better?
                suffix = r'(?![a-z0-9\[\]\(\)])[\.\- ][ \n]*'
                regex = r'{}item {}{}{}'.format(prefix, key, suffix, titles[key].split()[0])
                pattern.append(regex)
            pattern = r'|'.join(pattern)
            pattern = re.compile(pattern)

            # 1.3. Apply the regex, single left to right pass
            res = {section: [] for section in titles}
            for m in re.finditer(pattern, text):  # All the magic happens here!
                # section_number_found = re.findall(sections_pattern_10k, m.group())
                last_word = re.findall(r'\w+$', m.group())  # Used to backcalculate the section.
                # Sanity checks
                if len(last_word) > 1:
                    print(last_word)
                    raise ValueError('[ERROR] There should not be more than one set of numbers in the matched title.')
                elif len(last_word) == 0:
                    print(m.group())
                    raise ValueError('[ERROR] This match: |{}| could not be allocated'.format(m.group()))
                # Find the corresponding section
                corresponding_section = 0
                for k, v in titles.items():
                    if v.split()[0] == last_word[0]:
                        corresponding_section = k
                        break
                if corresponding_section == 0:
                    print(last_word)
                    raise ValueError("[ERROR] Could not find where |{}| goes".format(last_word[0]))
                else:
                    res[corresponding_section].append(m.span())

            # II. Now we get serious. Purge the ToC if it exists
            if verbose:
                print("[INFO] Before removing the toc:", res)
            original_res = copy.deepcopy(res)  # kept only for error reporting below
            res = {k: v for k, v in res.items() if len(v)}
            # Extract the Table of Content, if any.
            # The gist is that if it exists, all the populated keys should follow each other in order
            # print(all_sections_10q)
            # Remove the sections that are empty so we can iterate over non-zero sections
            # Hypothesis: 1a is not optional - financial statements should not be
            # if I.1. has two entries and the second is after the 1st last entry -> toc!
            # then rm all [0] entries, then re-delete all zero entries
            # else no toc and do nothing
            full_sect = list(res.keys())
            # Make sure you got something. If that is not the case, might just be a completely different template.
            try:
                assert len(full_sect)
            except:
                print("[ERROR] Here is full_sect: |{}|".format(full_sect))
                print("[ERROR] Original res:", original_res)
                raise
            if len(res[full_sect[0]]) >= 2:
                # Same ToC heuristic as the 10-Q branch.
                if res[full_sect[-1]][0][1] < res[full_sect[0]][1][0]:
                    # There is a toc!
                    # print("[INFO] Found a ToC!")
                    for v in res.values():  # Iterate through all the sections
                        del v[0]  # Remove all first titles found - they are the ToC
                    res = {k: v for k, v in res.items() if len(v)}
                else:
                    # print("[INFO] No ToC found")
                    pass

            # Extra step: make sure the first elements go in increasing order.
            try:
                res = clean_first_markers(res)
            except Exception as e:
                print('[ERROR] {} in parser.clean_first_markers (10-K)'.format(e))
                print("This is the res\n", res)
                raise
            if verbose:
                print("[INFO] After removing the toc:", res)
            finds = [len(value) for value in res.values()]
            # print(finds)

            # Remove the sections that are empty so we can iterate over non-zero sections
            all_sections_10k = [k for k in all_sections_10k if k in res.keys()]
            # print(res)

            # Find the start & stop of each section
            previous_start = 0
            for idx in range(len(all_sections_10k)-1):
                if all_sections_10k[idx] in self.s['sections_to_parse_10k']:  # Did we request to parse this section?
                    start = 0  # used for the data extraction
                    stop = 0
                    for span in res[all_sections_10k[idx]]:  # Go through all the titles found, in order
                        if span[1] > previous_start:  # found a starting point
                            start = span[1]
                            for span_next in res[all_sections_10k[idx+1]]:  # Same
                                if span_next[0] > start:
                                    stop = span_next[0]
                                    break  # Found a stopping point
                            break  # Found a starting point but not necessarily a stopping point!
                    if start and stop:
                        # assert stop > start
                        parsed_report[all_sections_10k[idx]] = text[start:stop]
                    else:
                        raise ValueError('This start {} and stop {} combination is invalid for 10-K section {}'
                                         .format(start, stop, all_sections_10k[idx]))
                    previous_start = stop

            # Backward pass: if there are some sections that were expected and did not
            # get populated, we populate them with a small statement.
            for section in self.s['sections_to_parse_10k']:
                try:
                    assert len(parsed_report[section]) > 0
                except KeyError:
                    # print("[WARNING] Section {} was found to be empty.".format(section))
                    parsed_report[section] = "Nothing found for this section."
                except AssertionError:
                    raise AssertionError("[ERROR] Why is that section filled with an empty text?")
                except:
                    raise

            # Delete the input we used and return the result
            del parsed_report['input']
            # print(parsed_report.keys())
            # print(parsed_report)

            # DEBUG: dead code kept disabled on purpose — dumps each parsed section to disk.
            if 0:
                with open('/home/alex/test_10-k_{}.txt'.format(np.random.randint(1000)), 'w') as f:
                    for section in parsed_report.keys():
                        if section != '0':
                            f.write(section+'\n')
                            f.write(parsed_report[section])
                            f.write('\n================================================================\n')
                            f.write('\n==========================NEW SECTION===========================\n')
                            f.write('\n================================================================\n')
        else:
            raise ValueError('[ERROR] No stage 2 parser for report type {}!'.format(parsed_report['0']['type']))

        if verbose:
            # Sanity print: ideally every section was found exactly twice (ToC + body).
            if len(list(set(finds))) != 1 or list(set(finds))[0] != 2:
                print("[WARNING] Issues parsing")
                # raise  # Figure it out!
        return parsed_report
def clean_first_markers(res):
    """
    Drop leftover Table-of-Contents references from the marker lists.

    Walks the sections in insertion order and, for each pair of consecutive
    sections, removes every leading marker of the later section that starts
    before the end of the first marker of the earlier section. ``res`` is
    modified in place and also returned.

    :param res: dict, keys are sections to parse and contain the locations where
     the titles were found in the text.
    :return: Filtered version of res without the ToC locations
    """
    # The goal is to layer the first marker in ascending order and remove
    # early references to them.
    ordered_keys = list(res.keys())
    for pos in range(len(ordered_keys) - 1):
        following = res[ordered_keys[pos + 1]]
        # Pop leading spans of the next section that begin before the end of
        # the current section's first span. The boundary is only read while
        # `following` is non-empty, mirroring the original iteration order.
        while following and following[0][0] < res[ordered_keys[pos]][0][1]:
            del following[0]
    return res