Python, Lxml - Access Text
I'm currently a bit out of ideas, and I really hope that you can give me a hint. It's probably best to explain my question with a small piece of sample code: from lxml import etree
Solution 1:
You could just reimplement the itertext() function and insert special handlers for ul, table if necessary:
from lxml import html
def _ul_handler(el):
    """Yield the flattened text of a ``<ul>`` as one list, then its tail.

    The list is built with the element's own ``itertext()`` method, so
    nested markup inside the ``<ul>`` is flattened into a single list.
    """
    yield list(el.itertext())
    if el.tail:  # a missing tail is None; never emit it as a text item
        yield el.tail


def itertext(root, handlers=None):
    """Yield the text fragments of *root* in document order.

    Args:
        root: An element exposing ``tag``, ``text``, ``tail`` and iteration
            over its children (the lxml / ElementTree element protocol).
        handlers: Optional mapping ``tag -> callable(element)``. A matching
            child is passed to its handler, and whatever the handler yields
            (including the child's tail, if wanted) is emitted in place of
            the default traversal. Defaults to collecting each ``<ul>``
            into a list.

    Yields:
        Text strings, plus whatever objects the handlers produce.
    """
    if handlers is None:
        handlers = {"ul": _ul_handler}
    # Comments and processing instructions expose a non-string ``tag``; their
    # own text is not document text, so only their tail is emitted below.
    if isinstance(root.tag, str) and root.text:
        yield root.text
    for el in root:
        handler = handlers.get(el.tag)
        if handler is None:
            # Pass the mapping down so nested elements use the same handlers.
            yield from itertext(el, handlers)
        else:
            yield from handler(el)
    if root.tail:
        yield root.tail
# Demo: parse the sample fragment and print every item itertext() produces.
print(
    list(
        itertext(
            html.fromstring(
                "<b>text0<i>text1</i><ul><li>item1</li>"
                "<li>item2</li></ul>text2<b/><b>sib</b>"
            )
        )
    )
)
Output
['text0', 'text1', ['item1', 'item2'], 'text2', 'sib']
Note: yield from X could be replaced by for x in X: yield x on Python versions older than 3.3.
To join adjacent strings:
def joinadj(iterable, join=' '.join):
    """Merge each run of adjacent strings in *iterable* into one item.

    Args:
        iterable: Items to scan; ``str`` instances are grouped, anything
            else is passed through unchanged.
        join: Callable that receives the list of adjacent strings and
            returns the merged value. Defaults to joining with a space.

    Yields:
        ``join(run)`` for every run of consecutive strings, and each
        non-string item as is, in the original order.
    """
    adj = []
    for item in iterable:
        if isinstance(item, str):
            adj.append(item)  # save for later
            continue
        if adj:  # flush the strings accumulated so far
            yield join(adj)
            # Start a fresh list instead of clearing in place: ``join`` may
            # have returned (or kept a reference to) the buffer it was given.
            adj = []
        yield item  # not a string, yield as is
    if adj:  # flush the trailing run
        yield join(adj)
# Demo: same sample fragment, with neighbouring text fragments merged.
print(
    list(
        joinadj(
            itertext(
                html.fromstring(
                    "<b>text0<i>text1</i><ul><li>item1</li>"
                    "<li>item2</li></ul>text2<b/><b>sib</b>"
                )
            )
        )
    )
)
Output
['text0 text1', ['item1', 'item2'], 'text2 sib']
To allow tables and nested lists inside <ul>, the handler should call itertext() recursively:
def ul_handler(el, handlers=None):
    """Yield the content of a ``<ul>`` as one (possibly nested) list, then its tail.

    Args:
        el: The ``<ul>`` element.
        handlers: Optional handler mapping forwarded to ``itertext()`` for
            the list's subtree. ``itertext()`` invokes handlers with the
            element only, so bind this argument (for example with
            ``functools.partial``) to use a custom mapping inside the list.
    """
    # The tail belongs after the list, not inside it, hence with_tail=False.
    yield list(itertext(el, handlers, with_tail=False))
    if el.tail:
        yield el.tail


def itertext(root, handlers=None, with_tail=True):
    """Yield the text fragments of *root* in document order.

    Args:
        root: An element exposing ``tag``, ``text``, ``tail`` and iteration
            over its children (the lxml / ElementTree element protocol).
        handlers: Optional mapping ``tag -> callable(element)``. A matching
            child is passed to its handler, and whatever the handler yields
            (including the child's tail, if wanted) is emitted in place of
            the default traversal. Defaults to ``{"ul": ul_handler}``.
        with_tail: When false, the tail text of *root* itself is omitted.

    Yields:
        Text strings, plus whatever objects the handlers produce (nested
        lists for ``<ul>`` with the default handlers).
    """
    if handlers is None:
        handlers = {"ul": ul_handler}
    # Comments and processing instructions expose a non-string ``tag``; their
    # own text is not document text, so only their tail is emitted below.
    if isinstance(root.tag, str) and root.text:
        yield root.text
    for el in root:
        handler = handlers.get(el.tag)
        if handler is None:
            # Pass the mapping down so nested elements use the same handlers.
            yield from itertext(el, handlers)
        else:
            yield from handler(el)
    if with_tail and root.tail:
        yield root.tail
# Demo: nested <ul> markup, printed with neighbouring text fragments merged.
print(
    list(
        joinadj(
            itertext(
                html.fromstring(
                    "<b>text0<i>text1</i><ul><li>item1</li>"
                    "<li>item2<ul><li>sub1<li>sub2</li></ul></ul>"
                    "text2<b/><b>sib</b>"
                )
            )
        )
    )
)
Output
['text0 text1', ['item1', 'item2', ['sub1', 'sub2']], 'text2 sib']
Post a Comment for "Python, Lxml - Access Text"