#!/bin/env python
from htmlentitydefs import entitydefs as ent
import string
# This module enables you to extract the text of a certain HTML element
#
# by Ran Novitsky Nof, 2010
# ran.nof@gmail.com
#
# example of use:
# Say we want to get the text of an element of type tag (e.g. 'div', 'a', 'span' etc.)
# that has an attribute key (e.g. "id", "class", "href" etc.) with a value of val.
# For example, to extract the text of a div element whose id is textdiv from a file htmlfile.html:
#
# <html>
# <head>
# :
# </head>
# <body>
# :
# <div id="textdiv">I will not buy this <a href="spam">record</a> it is scratched. </div>
# :
# </body>
# </html>
#
# use:
# from htmlparser import Parser
# htmlfile='htmlfile.html'
# tag,key,val = ('div','id','textdiv')
# text=Parser(htmlfile).getText(tag,key,val)
# print(text)
#
class Element(object):
    """Plain record describing one element found in the HTML body.

    Parser.getElements() creates one instance per matched opening/closing
    tag pair and fills in the fields below. All positions are character
    offsets into the body text held by the parser.

    Attributes:
        startTag:  offset of the '<' of the opening tag (-1 until set).
        endTag:    offset of the '>' of the opening tag (-1 until set).
        attrib:    dict mapping attribute names to their values.
        innerHTML: '' until set; the parser stores a (start, end) pair of
                   offsets delimiting the content between the opening and
                   the closing tag.
        tag:       tag name, e.g. 'div'.
        start:     offset where the element starts (same as startTag).
        end:       offset of the '>' of the closing tag (-1 until set).
    """

    def __init__(self):
        self.startTag = -1
        self.endTag = -1
        self.attrib = {}
        self.innerHTML = ''
        self.tag = ''
        self.start = -1
        self.end = -1

    @property
    def keys(self):
        """Return the current attribute names as a list.

        This used to be computed once in __init__ from the still-empty
        dict, so it never reflected the attributes assigned afterwards.
        Computing it on access keeps it in sync with ``attrib``.
        """
        return list(self.attrib.keys())
class Parser():
    """Minimal tag scanner that extracts the text of one HTML element.

    Only the part of the file between the <body ...> tag and </body is
    looked at. The markup is not validated: every '<' ... '>' pair is
    treated as a tag (comments included), attributes are split on
    whitespace, and only double quotes are removed from attribute text.

    Attributes:
        tags:      set of tag names seen as opening tags.
        tagstarts: tag name -> list of (lt, gt) offsets of opening tags.
        tagends:   tag name -> list, parallel to tagstarts, holding the
                   (lt, gt) offsets of the matching closing tag, or None
                   when the element was never closed.
        elements:  list of Element records built by getElements().
    """

    def __init__(self, infile):
        """Read *infile*, index every tag in its body and build elements.

        Args:
            infile: path of the HTML file to parse.
        """
        # Context manager so the file handle is released right away.
        with open(infile) as htmlfile:
            raw = htmlfile.read()
        bodyStart = raw.find('<body')
        if bodyStart < 0:
            # No body at all: nothing to parse.
            self._root = ''
        else:
            bodyEnd = raw.find('</body', bodyStart)
            if bodyEnd < 0:
                # Unterminated body: keep everything up to the end of the
                # file instead of silently dropping the last character.
                bodyEnd = len(raw)
            body = raw[bodyStart:bodyEnd]
            # Drop the opening <body ...> tag itself.
            self._root = body[body.find('>') + 1:]
        self.tags = set()
        self.tagstarts = {}
        self.tagends = {}
        self.elements = []
        pos = 0
        while pos < len(self._root):
            i = self._root.find('<', pos)
            if i < 0:
                break
            j = self._root.find('>', i)
            if j < 0:
                break
            pos = j + 1
            fields = self._root[i + 1:j].split()
            if not fields:
                # '<>' or '< >': no tag name to record.
                continue
            tag = fields[0]
            if tag.startswith('/'):
                tag = tag[1:]
                pending = self.tagends.get(tag)
                if pending is None or None not in pending:
                    # Closing tag without a matching open one: ignore it.
                    continue
                # Close the most recently opened element of this type
                # that is still open.
                lastOpen = len(pending) - 1 - pending[::-1].index(None)
                pending[lastOpen] = (i, j)
            else:
                # '<br/>' is recorded under the name 'br'.
                tag = tag.rstrip('/')
                self.tags.add(tag)
                self.tagstarts.setdefault(tag, []).append((i, j))
                self.tagends.setdefault(tag, []).append(None)
        self.getElements()

    def getElements(self):
        """Append an Element to self.elements for every closed tag pair.

        'img' tags are skipped, and so is any element whose closing tag
        was never found (e.g. <br>): it has no inner content to expose.
        """
        for tag in self.tags:
            if tag in ['img']:
                continue
            for opening, closing in zip(self.tagstarts[tag], self.tagends[tag]):
                if closing is None:
                    continue
                element = Element()
                element.startTag = opening[0]
                element.endTag = opening[1]
                tagData = self._root[element.startTag + 1:element.endTag].replace("\"", "")
                element.tag = tag
                # Split on the first '=' only, so values such as
                # href="page?a=b" stay intact.
                element.attrib = dict(a.split('=', 1) for a in tagData.split() if '=' in a)
                element.start = element.startTag
                element.end = closing[1]
                element.innerHTML = (opening[1] + 1, closing[0])
                self.elements.append(element)

    def getText(self, tag, key, val):
        """Return the text of the first *tag* element whose *key* is *val*.

        Nested tags are removed and named character entities found in the
        entity table are replaced by the table's value; unknown names are
        left untouched.

        Args:
            tag: element type, e.g. 'div'.
            key: attribute name, e.g. 'id'.
            val: required value of that attribute.

        Returns:
            The extracted text, or None when no element matches.
        """
        # Elements lacking the attribute simply do not match.
        matches = [element for element in self.elements
                   if element.tag == tag
                   and key in element.attrib
                   and element.attrib[key] == val]
        if not matches:
            return None
        start, end = matches[0].innerHTML
        text = self._root[start:end]
        # Remove every '<...>' span.
        i = 0
        while i < len(text):
            i = text.find('<', i)
            if i < 0:
                break
            j = text.find('>', i)
            if j < 0:
                break
            text = text[:i] + text[j + 1:]
        # Replace '&name;' for every name known to the entity table.
        i = text.find('&')
        while i >= 0:
            j = text.find(';', i + 1)
            if j < 0:
                break
            name = text[i + 1:j]
            if name in ent:
                text = text[:i] + ent[name] + text[j + 1:]
            i = text.find('&', i + 1)
        return text
# Note: the code does not check whether the HTML is well formed. It also works only on the body part and ignores img tags.
from htmlentitydefs import entitydefs as ent
import string
# This module enables you to extract the text of a certain HTML element
#
# by Ran Novitsky Nof, 2010
# ran.nof@gmail.com
#
# example of use:
# Say we want to get the text of an element of type tag (e.g. 'div', 'a', 'span' etc.)
# that has an attribute key (e.g. "id", "class", "href" etc.) with a value of val.
# For example, to extract the text of a div element whose id is textdiv from a file htmlfile.html:
#
# <html>
# <head>
# :
# </head>
# <body>
# :
# <div id="textdiv">I will not buy this <a href="spam">record</a> it is scratched. </div>
# :
# </body>
# </html>
#
# use:
# from htmlparser import Parser
# htmlfile='htmlfile.html'
# tag,key,val = ('div','id','textdiv')
# text=Parser(htmlfile).getText(tag,key,val)
# print(text)
#
class Element(object):
    """Plain record describing one element found in the HTML body.

    Parser.getElements() creates one instance per matched opening/closing
    tag pair and fills in the fields below. All positions are character
    offsets into the body text held by the parser.

    Attributes:
        startTag:  offset of the '<' of the opening tag (-1 until set).
        endTag:    offset of the '>' of the opening tag (-1 until set).
        attrib:    dict mapping attribute names to their values.
        innerHTML: '' until set; the parser stores a (start, end) pair of
                   offsets delimiting the content between the opening and
                   the closing tag.
        tag:       tag name, e.g. 'div'.
        start:     offset where the element starts (same as startTag).
        end:       offset of the '>' of the closing tag (-1 until set).
    """

    def __init__(self):
        self.startTag = -1
        self.endTag = -1
        self.attrib = {}
        self.innerHTML = ''
        self.tag = ''
        self.start = -1
        self.end = -1

    @property
    def keys(self):
        """Return the current attribute names as a list.

        This used to be computed once in __init__ from the still-empty
        dict, so it never reflected the attributes assigned afterwards.
        Computing it on access keeps it in sync with ``attrib``.
        """
        return list(self.attrib.keys())
class Parser():
    """Minimal tag scanner that extracts the text of one HTML element.

    Only the part of the file between the <body ...> tag and </body is
    looked at. The markup is not validated: every '<' ... '>' pair is
    treated as a tag (comments included), attributes are split on
    whitespace, and only double quotes are removed from attribute text.

    Attributes:
        tags:      set of tag names seen as opening tags.
        tagstarts: tag name -> list of (lt, gt) offsets of opening tags.
        tagends:   tag name -> list, parallel to tagstarts, holding the
                   (lt, gt) offsets of the matching closing tag, or None
                   when the element was never closed.
        elements:  list of Element records built by getElements().
    """

    def __init__(self, infile):
        """Read *infile*, index every tag in its body and build elements.

        Args:
            infile: path of the HTML file to parse.
        """
        # Context manager so the file handle is released right away.
        with open(infile) as htmlfile:
            raw = htmlfile.read()
        bodyStart = raw.find('<body')
        if bodyStart < 0:
            # No body at all: nothing to parse.
            self._root = ''
        else:
            bodyEnd = raw.find('</body', bodyStart)
            if bodyEnd < 0:
                # Unterminated body: keep everything up to the end of the
                # file instead of silently dropping the last character.
                bodyEnd = len(raw)
            body = raw[bodyStart:bodyEnd]
            # Drop the opening <body ...> tag itself.
            self._root = body[body.find('>') + 1:]
        self.tags = set()
        self.tagstarts = {}
        self.tagends = {}
        self.elements = []
        pos = 0
        while pos < len(self._root):
            i = self._root.find('<', pos)
            if i < 0:
                break
            j = self._root.find('>', i)
            if j < 0:
                break
            pos = j + 1
            fields = self._root[i + 1:j].split()
            if not fields:
                # '<>' or '< >': no tag name to record.
                continue
            tag = fields[0]
            if tag.startswith('/'):
                tag = tag[1:]
                pending = self.tagends.get(tag)
                if pending is None or None not in pending:
                    # Closing tag without a matching open one: ignore it.
                    continue
                # Close the most recently opened element of this type
                # that is still open.
                lastOpen = len(pending) - 1 - pending[::-1].index(None)
                pending[lastOpen] = (i, j)
            else:
                # '<br/>' is recorded under the name 'br'.
                tag = tag.rstrip('/')
                self.tags.add(tag)
                self.tagstarts.setdefault(tag, []).append((i, j))
                self.tagends.setdefault(tag, []).append(None)
        self.getElements()

    def getElements(self):
        """Append an Element to self.elements for every closed tag pair.

        'img' tags are skipped, and so is any element whose closing tag
        was never found (e.g. <br>): it has no inner content to expose.
        """
        for tag in self.tags:
            if tag in ['img']:
                continue
            for opening, closing in zip(self.tagstarts[tag], self.tagends[tag]):
                if closing is None:
                    continue
                element = Element()
                element.startTag = opening[0]
                element.endTag = opening[1]
                tagData = self._root[element.startTag + 1:element.endTag].replace("\"", "")
                element.tag = tag
                # Split on the first '=' only, so values such as
                # href="page?a=b" stay intact.
                element.attrib = dict(a.split('=', 1) for a in tagData.split() if '=' in a)
                element.start = element.startTag
                element.end = closing[1]
                element.innerHTML = (opening[1] + 1, closing[0])
                self.elements.append(element)

    def getText(self, tag, key, val):
        """Return the text of the first *tag* element whose *key* is *val*.

        Nested tags are removed and named character entities found in the
        entity table are replaced by the table's value; unknown names are
        left untouched.

        Args:
            tag: element type, e.g. 'div'.
            key: attribute name, e.g. 'id'.
            val: required value of that attribute.

        Returns:
            The extracted text, or None when no element matches.
        """
        # Elements lacking the attribute simply do not match.
        matches = [element for element in self.elements
                   if element.tag == tag
                   and key in element.attrib
                   and element.attrib[key] == val]
        if not matches:
            return None
        start, end = matches[0].innerHTML
        text = self._root[start:end]
        # Remove every '<...>' span.
        i = 0
        while i < len(text):
            i = text.find('<', i)
            if i < 0:
                break
            j = text.find('>', i)
            if j < 0:
                break
            text = text[:i] + text[j + 1:]
        # Replace '&name;' for every name known to the entity table.
        i = text.find('&')
        while i >= 0:
            j = text.find(';', i + 1)
            if j < 0:
                break
            name = text[i + 1:j]
            if name in ent:
                text = text[:i] + ent[name] + text[j + 1:]
            i = text.find('&', i + 1)
        return text