import re
import numpy as np
import copy
[docs]class stage_2_parser():
"""
Stage 2 parser: splits the text of a 10-Q or 10-K report into its "item" sections.

The constructor stores ``s``; ``parse`` reads ``s['sections_to_parse_10q']`` and
``s['sections_to_parse_10k']`` from it to decide which sections to extract.
"""
# NOTE(review): the leading "[docs]" markers and the missing indentation in this listing look like
# artifacts of a Sphinx "view source" page rather than real Python - confirm against the repository.
def __init__(self, s):
self.s = s
[docs] def parse(self, parsed_report, verbose=False):
"""
Split the text of a report into sections, one dict key per section.

The text is read from ``parsed_report['input']`` and lowercased; the report type is read from
``parsed_report['0']['type']`` and must be '10-Q' or '10-K'. Section titles are located with one
combined regex, a leading table of contents (if detected) is discarded, and the text between the
end of one title and the start of the next identified title is stored under the section key.
Requested sections that were not populated are set to "Nothing found for this section.".
The 'input' key is deleted before returning.

:param parsed_report: dict holding the raw text under 'input' and the metadata under '0'
:param verbose: Increase the amount of printing to the terminal
:return: the same dict, with the text of each extracted section added. Metadata is in '0'
:raises ValueError: unsupported report type, a matched title that cannot be mapped to a section,
or no valid start/stop pair for a requested section
:raises AssertionError: no section title was found at all, or a section ended up with empty text
"""
text = parsed_report['input']
text = text.lower()
finds = []
if parsed_report['0']['type'] == '10-Q':
# 1. Setup the giant regex to use to parse all potential sections in the report
# 1.1. List of all possible titles
# Keys are '<part>_<item>' padded so that key[3:] is the item number used in the regex below.
titles = {
'_i_1': 'financial statements',
'_i_2': 'management s discussion and analysis of financial condition and results of operations',
'_i_3': 'quantitative and qualitative disclosures about market risk',
'_i_4': 'controls and procedures',
'ii_1': 'legal proceedings',
'ii_1a': 'risk factors',
'ii_2': 'unregistered sales of equity securities and use of proceeds',
'ii_3': 'defaults upon senior securities',
'ii_4': 'mine safety disclosures',
'ii_5': 'other information',
'ii_6': 'exhibits'
}
all_sections_10q = list(titles.keys())
# 1.2. Create the regex
pattern = []
for key in titles:
# Need to parse all potential sections in case they are present.
# prefix: a newline/carriage return optionally followed by one space, or a run of 2+ spaces.
prefix = r'([\n\r] ?| {2,})' # Is {3,} better?
# suffix: the item number must not be followed by a letter, digit, bracket or parenthesis;
# then one of '.', '-' or ' ', then any run of spaces/newlines.
suffix = r'(?![a-z0-9\[\]\(\)])[\.\- ][ \n]*'
# Because the item numbers are not unique, we NEED to include at least one word of the title
regex = r'{}item {}{}{}'.format(prefix, key[3:], suffix, titles[key].split()[0])
# print(regex)
pattern.append(regex)
pattern = r'|'.join(pattern)
pattern = re.compile(pattern)
# 1.3. Apply the regex, single left to right pass
res = {section: [] for section in titles} # Will contain all the parsed data
for m in re.finditer(pattern, text): # All the magic happens here!
# The match ends with the first word of the title; that word identifies the section.
last_word = re.findall(r'\w+$', m.group()) # Used to backcalculate the section.
# Sanity checks
if len(last_word) > 1:
print(last_word)
raise ValueError('[ERROR] There should not be more than one set of numbers in the matched title.')
elif len(last_word) == 0:
print(last_word)
raise ValueError('[ERROR] This match could not be allocated')
# Find the corresponding section
# 0 is used as the "not found" sentinel.
corresponding_section = 0
for k, v in titles.items():
if v.split()[0] == last_word[0]:
corresponding_section = k
break
if corresponding_section == 0:
print(last_word)
raise ValueError("[ERROR] Could not find where |{}| goes".format(last_word[0]))
else:
# Store the (start, end) offsets of the matched title.
res[corresponding_section].append(m.span())
# II. Now we get serious. Purge the ToC if it exists
# verbose=True
# print(res)
if verbose:
print("[INFO] Before removing the toc:", res)
# Snapshot kept only for the error printout below.
original_res = copy.deepcopy(res)
# Keep only the sections for which at least one title was matched.
res = {k: v for k, v in res.items() if len(v)}
# print(list(res.keys()))
# Extract the Table of Content, if any
# The gist is that if it exists, all the populated keys should follow each other in order
# print(all_sections_10q)
# Remove the sections that are empty so we can iterate over non-zero sections
# Hypothesis: 1a is not optional - financial statements should not be
# if I.1. has two entries and the second is after the 1st last entry -> toc!
# then rm all [0] entries, then re-delete all zero entries
# else no toc and do nothing
full_sect = list(res.keys())
# Make sure you got something. If that is not the case, might just be a completely different template.
# NOTE(review): this check relies on `assert`, which Python skips when run with -O.
try:
assert len(full_sect)
except:
print("[ERROR] Here is full_sect: |{}|".format(full_sect))
print("[ERROR] Original res:", original_res)
raise
# ToC heuristic: the first populated section has 2+ matches and the first match of the last
# populated section ends before the second match of the first section starts.
if len(res[full_sect[0]]) >= 2:
if res[full_sect[-1]][0][1] < res[full_sect[0]][1][0]:
# There is a toc!
# print("[INFO] Found a ToC!")
for v in res.values():
del v[0] # Remove all first sections
res = {k: v for k, v in res.items() if len(v)}
else:
# print("[INFO] No ToC found")
pass
# Extra step: make sure the first elements go in increasing order.
try:
res = clean_first_markers(res)
except Exception as e:
print('[ERROR] {} in parser.clean_first_markers (10-Q)'.format(e))
print("This is the res\n", res)
raise
if verbose:
print("[INFO] After removing the toc:", res)
# Number of title matches left per section; only used for the verbose warning at the end.
finds = [len(value) for value in res.values()]
# Shrink the list of sections to review
all_sections_10q = [k for k in all_sections_10q if k in res.keys()]
# Extract the text for all the sections that we identified
# The loop stops one short of the end: the last identified section only serves as the stop
# boundary of the one before it and is never stored itself.
previous_start = 0
for idx in range(len(all_sections_10q)-1):
if all_sections_10q[idx] in self.s['sections_to_parse_10q']: # Did we request to parse this section?
start = 0 # used for the data extraction
stop = 0
for span in res[all_sections_10q[idx]]: # Go through all the titles found, in order
if span[1] > previous_start: # found a starting point
# The section text starts where its title ends.
start = span[1]
for span_next in res[all_sections_10q[idx+1]]: # Same
if span_next[0] > start:
# The section text stops where the next title starts.
stop = span_next[0]
break # Found a stopping point
# NOTE(review): indentation is lost in this listing, so it is unclear whether this `else` pairs
# with the `if` or with the `for` above. Either way the deletion uses `idx` (the section-loop
# index) as a position in the marker list, which looks unintended; the 10-K branch has no such
# `else`. Confirm against the real source.
else:
del res[all_sections_10q[idx+1]][idx]
break # Found a starting point but not necessarily a stopping point!
# Both offsets must be non-zero, otherwise the section cannot be delimited.
if start and stop: #
assert stop > start
parsed_report[all_sections_10q[idx]] = text[start:stop]
else:
raise ValueError('This start {} and stop {} combination is invalid for 10-Q section {}'
.format(start, stop, all_sections_10q[idx]))
# The next section must start after the end of this one.
previous_start = stop
# Backward pass: if there are some sections that were expected and did not get populated
# we populate them with a small statement.
for section in self.s['sections_to_parse_10q']:
try:
assert len(parsed_report[section]) > 0
except KeyError:
# print("[WARNING] Section {} was found to be empty.".format(section))
parsed_report[section] = "Nothing found for this section."
except AssertionError:
raise AssertionError("[ERROR] Why is that section filled with an empty text?")
except:
raise
# Delete the input we used and return the result
del parsed_report['input']
# DEBUG
# Disabled debug dump of every section to a text file.
if 0:
with open('/home/alex/test_10-q_{}.txt'.format(np.random.randint(1000)), 'w') as f:
for section in parsed_report.keys():
if section != '0':
f.write(section+'\n')
f.write(parsed_report[section])
f.write('\n================================================================\n')
f.write('\n==========================NEW SECTION===========================\n')
f.write('\n================================================================\n')
# NOTE(review): `regex` here is whatever the title loop built last, and `res` is overwritten with a
# list. Whether these two lines sit inside the disabled `if 0:` block cannot be told from this
# listing; the 10-K branch has no equivalent. Confirm against the real source.
res = re.findall(regex, text)
finds.append(len(res))
# print(finds)
elif parsed_report['0']['type'] == '10-K':
# 1. Setup the giant regex to use to parse all potential sections in the report
# 1.1. List of all possible titles
# Keys are the item numbers, used directly in the regex below.
titles = {
'1': 'business',
'1a': 'risk factors',
'1b': 'unresolved staff comments',
'2': 'properties',
'3': 'legal proceedings',
'4': 'submission of matters to a vote of security holders',
'5': 'market for registrant s common equity, related stockholder matters and issuer purchases of equity securities',
'6': 'selected financial data',
'7': 'management s discussion and analysis of financial condition and results of operations',
'7a': 'quantitative and qualitative disclosures about market risk',
'8': 'financial statements and supplementary data',
'9': 'changes in and disagreements with accountants on accounting and financial disclosure',
'9a': 'controls and procedures',
'9b': 'other information',
'10': 'directors executive officers and corporate governance',
'11': 'executive compensation',
'12': 'security ownership of certain beneficial owners and management and related stockholder matters',
'13': 'certain relationships and related transactions, and director independence',
'14': 'principal account(ant|ing) fees and services',
'15': 'exhibits financial statement schedules'
}
all_sections_10k = list(titles.keys())
# 1.2. Create the regex
pattern = []
for key in titles:
# Same prefix/suffix as the 10-Q branch; only the first word of each title is matched.
prefix = r'([\n\r] ?| {2,})' # Is {3,} better?
suffix = r'(?![a-z0-9\[\]\(\)])[\.\- ][ \n]*'
regex = r'{}item {}{}{}'.format(prefix, key, suffix, titles[key].split()[0])
pattern.append(regex)
pattern = r'|'.join(pattern)
pattern = re.compile(pattern)
# 1.3. Apply the regex, single left to right pass
res = {section: [] for section in titles}
for m in re.finditer(pattern, text): # All the magic happens here!
# section_number_found = re.findall(sections_pattern_10k, m.group())
# The match ends with the first word of the title; that word identifies the section.
last_word = re.findall(r'\w+$', m.group()) # Used to backcalculate the section.
# Sanity checks
if len(last_word) > 1:
print(last_word)
raise ValueError('[ERROR] There should not be more than one set of numbers in the matched title.')
elif len(last_word) == 0:
print(m.group())
raise ValueError('[ERROR] This match: |{}| could not be allocated'.format(m.group()))
# Find the corresponding section
# 0 is used as the "not found" sentinel.
corresponding_section = 0
for k, v in titles.items():
if v.split()[0] == last_word[0]:
corresponding_section = k
break
if corresponding_section == 0:
print(last_word)
raise ValueError("[ERROR] Could not find where |{}| goes".format(last_word[0]))
else:
# Store the (start, end) offsets of the matched title.
res[corresponding_section].append(m.span())
# II. Now we get serious. Purge the ToC if it exists
if verbose:
print("[INFO] Before removing the toc:", res)
# Snapshot kept only for the error printout below.
original_res = copy.deepcopy(res)
# Keep only the sections for which at least one title was matched.
res = {k: v for k, v in res.items() if len(v)}
# Extract the Table of Content, if any
# The gist is that if it exists, all the populated keys should follow each other in order
# print(all_sections_10q)
# Remove the sections that are empty so we can iterate over non-zero sections
# Hypothesis: 1a is not optional - financial statements should not be
# if I.1. has two entries and the second is after the 1st last entry -> toc!
# then rm all [0] entries, then re-delete all zero entries
# else no toc and do nothing
full_sect = list(res.keys())
# Make sure you got something. If that is not the case, might just be a completely different template.
# NOTE(review): this check relies on `assert`, which Python skips when run with -O.
try:
assert len(full_sect)
except:
print("[ERROR] Here is full_sect: |{}|".format(full_sect))
print("[ERROR] Original res:", original_res)
raise
# ToC heuristic: the first populated section has 2+ matches and the first match of the last
# populated section ends before the second match of the first section starts.
if len(res[full_sect[0]]) >= 2:
if res[full_sect[-1]][0][1] < res[full_sect[0]][1][0]:
# There is a toc!
# print("[INFO] Found a ToC!")
for v in res.values(): # Iterate through all the sections
del v[0] # Remove all first titles found - they are the ToC
res = {k: v for k, v in res.items() if len(v)}
else:
# print("[INFO] No ToC found")
pass
# Extra step: make sure the first elements go in increasing order.
try:
res = clean_first_markers(res)
except Exception as e:
print('[ERROR] {} in parser.clean_first_markers (10-K)'.format(e))
print("This is the res\n", res)
raise
if verbose:
print("[INFO] After removing the toc:", res)
# Number of title matches left per section; only used for the verbose warning at the end.
finds = [len(value) for value in res.values()]
# print(finds)
# Remove the sections that are empty so we can iterate over non-zero sections
all_sections_10k = [k for k in all_sections_10k if k in res.keys()]
# print(res)
# Find the start & stop of each section
# The loop stops one short of the end: the last identified section only serves as the stop
# boundary of the one before it and is never stored itself.
previous_start = 0
for idx in range(len(all_sections_10k)-1):
if all_sections_10k[idx] in self.s['sections_to_parse_10k']: # Did we request to parse this section?
start = 0 # used for the data extraction
stop = 0
for span in res[all_sections_10k[idx]]: # Go through all the titles found, in order
if span[1] > previous_start: # found a starting point
# The section text starts where its title ends.
start = span[1]
for span_next in res[all_sections_10k[idx+1]]: # Same
if span_next[0] > start:
# The section text stops where the next title starts.
stop = span_next[0]
break # Found a stopping point
break # Found a starting point but not necessarily a stopping point!
# Both offsets must be non-zero, otherwise the section cannot be delimited.
if start and stop: #
assert stop > start
parsed_report[all_sections_10k[idx]] = text[start:stop]
else:
raise ValueError('This start {} and stop {} combination is invalid for 10-K section {}'
.format(start, stop, all_sections_10k[idx]))
# The next section must start after the end of this one.
previous_start = stop
# Backward pass: requested sections that did not get populated receive a placeholder text.
for section in self.s['sections_to_parse_10k']:
try:
assert len(parsed_report[section]) > 0
except KeyError:
# print("[WARNING] Section {} was found to be empty.".format(section))
parsed_report[section] = "Nothing found for this section."
except AssertionError:
raise AssertionError("[ERROR] Why is that section filled with an empty text?")
except:
raise
# Delete the input we used and return the result
del parsed_report['input']
# print(parsed_report.keys())
# print(parsed_report)
# DEBUG
# Disabled debug dump of every section to a text file.
if 0:
with open('/home/alex/test_10-k_{}.txt'.format(np.random.randint(1000)), 'w') as f:
for section in parsed_report.keys():
if section != '0':
f.write(section+'\n')
f.write(parsed_report[section])
f.write('\n================================================================\n')
f.write('\n==========================NEW SECTION===========================\n')
f.write('\n================================================================\n')
else:
raise ValueError('[ERROR] No stage 2 parser for report type {}!'.format(parsed_report['0']['type']))
if verbose:
# Warn unless every remaining section has the same number of title matches and that number is 2.
if len(list(set(finds))) != 1 or list(set(finds))[0] != 2:
print("[WARNING] Issues parsing")
# raise # Figure it out!
return parsed_report
def clean_first_markers(res):
    """
    Remove title locations that appear too early relative to the preceding section.

    Sections are visited in the insertion order of ``res``. For each section, the leading markers
    whose start offset lies before the end of the first remaining marker of the closest preceding
    non-empty section are deleted (typically leftovers of a table of contents or early references).
    Trimming stops at the first marker that is far enough, so later markers are never touched.

    A section left without any marker is removed from ``res`` instead of being kept as an empty
    list: it can no longer serve as a reference point, and indexing its first marker is what used
    to raise ``IndexError`` on the following section.

    :param res: dict, keys are sections to parse and values are lists of (start, end) locations where
        the titles were found in the text. It is modified in place.
    :return: the same dict object, filtered
    """
    # Work on a snapshot of the keys because sections may be removed from res along the way.
    sections = list(res.keys())
    reference_end = None  # End offset of the first marker of the last non-empty section seen.
    for section in sections:
        markers = res[section]
        if reference_end is not None:
            # Count the leading markers that start before the reference title ends.
            early = 0
            for marker in markers:
                if marker[0] < reference_end:
                    early += 1
                else:
                    break
            del markers[:early]
        if markers:
            reference_end = markers[0][1]
        else:
            # Nothing left for this section: drop it and keep the previous reference.
            del res[section]
    return res