10 from bs4
import BeautifulSoup, NavigableString
13 import multiprocessing
32 def _HasClass(tag, *classes):
33 for c
in tag.get(
'class', []):
39 def _ParseSymbolPage(symbol_page_html, symbol_name):
40 """Parse symbol page and retrieve the include header defined in this page. 41 The symbol page provides header for the symbol, specifically in 42 "Defined in header <header>" section. An example: 44 <tr class="t-dsc-header"> 45 <td colspan="2"> <div>Defined in header <code><ratio></code> </div> 48 Returns a list of headers. 53 soup = BeautifulSoup(symbol_page_html,
"html.parser")
60 for table
in soup.select(
'table.t-dcl-begin, table.t-dsc-begin'):
63 for row
in table.select(
'tr'):
64 if _HasClass(row,
't-dcl',
't-dsc'):
67 found_symbols = row.find(
'td').stripped_strings
68 if not symbol_name
in found_symbols:
70 headers.update(current_headers)
71 elif _HasClass(row,
't-dsc-header'):
78 if not "Defined in header " in row.text:
81 for header_code
in row.find_all(
"code"):
82 current_headers.append(header_code.text)
83 all_headers.add(header_code.text)
85 return headers
or all_headers
88 def _ParseIndexPage(index_page_html):
90 The index page lists all std symbols and hrefs to their detailed pages 91 (which contain the defined header). An example: 93 <a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br> 94 <a href="acos.html" title="acos"><tt>acos()</tt></a> <br> 96 Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant). 99 soup = BeautifulSoup(index_page_html,
"html.parser")
100 for symbol_href
in soup.select(
"a[title]"):
105 caption = symbol_href.next_sibling
106 variant = isinstance(caption, NavigableString)
and "(" in caption
107 symbol_tt = symbol_href.find(
"tt")
109 symbols.append((symbol_tt.text.rstrip(
"<>()"),
110 symbol_href[
"href"], variant))
def _ReadSymbolPage(path, name):
    """Read one symbol page from disk and extract the headers for a symbol.

    Args:
      path: filesystem path of the symbol's HTML page.
      name: symbol name to look for in that page.

    Returns:
      Whatever _ParseSymbolPage returns for the page contents and `name`
      (the collection of headers found for the symbol).

    Raises:
      OSError: if the page cannot be opened.
      UnicodeDecodeError: if the page is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the locale's default encoding,
    # which differs between hosts and can fail on non-ASCII page content.
    # NOTE(review): assumes the HTML pages are stored as UTF-8 -- confirm.
    with open(path, encoding="utf-8") as page:
        html = page.read()
    return _ParseSymbolPage(html, name)
119 def _GetSymbols(pool, root_dir, index_page_name, namespace):
120 """Get all symbols listed in the index page. All symbols should be in the 123 Returns a list of Symbols. 131 index_page_path = os.path.join(root_dir, index_page_name)
132 with open(index_page_path,
"r") as f: 135 for symbol_name, symbol_page_path, variant
in _ParseIndexPage(f.read()):
140 path = os.path.join(root_dir, symbol_page_path)
141 results.append((symbol_name,
142 pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
145 symbol_headers = collections.defaultdict(set)
146 for symbol_name, lazy_headers
in results:
147 symbol_headers[symbol_name].update(lazy_headers.get())
150 for name, headers
in sorted(symbol_headers.items(), key=
lambda t : t[0]):
151 symbols.append(
Symbol(name, namespace, list(headers)))
156 """Get all symbols by parsing the given pages. 159 parse_pages: a list of tuples (page_root_dir, index_page_name, namespace) 164 pool = multiprocessing.Pool(
165 initializer=
lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
167 for root_dir, page_name, namespace
in parse_pages:
168 symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace))
def GetSymbols(parse_pages)
def __init__(self, name, namespace, headers)