Tuesday, September 28, 2010

HowTo Parse HTML text using python.

I needed to get the text of a specific div element in an HTML file. I tried to use Python's standard library modules for markup processing, such as htmllib, but I couldn't figure out how to use them. So I created my own module just for getting the text from an HTML element:

#!/bin/env python
from htmlentitydefs import entitydefs as ent
import string
# This module enables you to extract the text from a certain HTML element
# by Ran Novitsky Nof, 2010
# ran.nof@gmail.com
# example of use:
# say we want to get the text of an element of type tag (e.g. 'div', 'a', 'span' etc.)
# that has an attribute key (e.g. "id", "class", "href" etc.) with a value of val.
# For example, to extract the text of a div element with an id of textdiv from the file htmlfile.html:
# <html>
# <head>
#   :
# </head>
# <body>
#   :
# <div id="textdiv">I will not buy this <a href="spam">record</a> it is scratched. </div>
#   :
# </body>
# </html>
# use:
# from htmlparser import Parser
# htmlfile='htmlfile.html'
# tag,key,val = ('div','id','textdiv')
# text=Parser(htmlfile).getText(tag,key,val)
# print(text)
class Element():
  """Record describing one HTML element located by Parser.

  All offsets index into the body string held by the Parser (Parser._root).
  A freshly created Element holds placeholder values (-1, '' or {}).

  Attributes:
    startTag  -- offset of the '<' of the opening tag
    endTag    -- offset of the '>' of the opening tag
    start     -- same offset as startTag
    end       -- offset of the '>' of the closing tag
    tag       -- element name, e.g. 'div'
    attrib    -- dict mapping attribute names to their values
    keys      -- the keys of attrib
    innerHTML -- (first, last) slice bounds of the content between the
                 opening and the closing tag ('' until Parser fills it in)
  """
  def __init__(self):
    self.startTag = -1
    self.endTag = -1
    self.attrib = {}
    # NOTE: this only tracks the dict created above; whoever replaces
    # attrib has to refresh keys as well (Parser.getElements does).
    self.keys = self.attrib.keys()
    self.innerHTML = ''
    self.tag = ''
    self.start = -1
    self.end = -1

class Parser():
  """Very small HTML scanner that extracts the text of a single element.

  Only the part of the file between <body ...> and </body> is scanned.
  If the file has no <body tag at all, the whole file is treated as the body.
  The markup is not validated, tag names are case sensitive and img
  elements are ignored.

  Attributes:
    tags      -- set of all tag names that appear as a start tag
    tagstarts -- dict: tag name -> list of (lt, gt) offsets of start tags
    tagends   -- dict: tag name -> list of (lt, gt) offsets of end tags
                 tagstarts[t][k] and tagends[t][k] always belong to the same
                 element; start tags that are never closed (e.g. <br>) are
                 not listed
    elements  -- list of Element objects in document order
  """
  def __init__(self,infile):
    """Read infile, locate every start/end tag pair and build the elements.

    infile -- path of the HTML file to read
    """
    # 'with' guarantees the file handle is closed again
    with open(infile) as source:
      text = source.read()
    bodyStart = text.find('<body')
    if bodyStart >= 0:
      bodyEnd = text.find('</body', bodyStart)
      if bodyEnd < 0:
        # no closing body tag: keep everything up to the end of the file
        bodyEnd = len(text)
      text = text[bodyStart:bodyEnd]
      # drop the remainder of the <body ...> tag itself
      text = text[text.find('>')+1:]
    self._root = text
    self.tags = set()
    self.tagstarts = {}
    self.tagends = {}
    self.elements = []
    pending = {}  # tag name -> stack of start tags still waiting for an end tag
    pairs = []    # (start tag span, end tag span, tag name)
    i = 0
    while i<len(self._root):
      i = self._root.find('<',i)
      if i<0: break
      j = self._root.find('>',i)
      if j<0: break
      words = self._root[i+1:j].split()
      tag = words[0] if words else ''
      if tag.startswith('/'):
        tag = tag[1:]
        # An end tag closes the most recent open start tag of the same name,
        # so nested elements of one type are paired correctly.
        if pending.get(tag):
          pairs.append((pending[tag].pop(),(i,j),tag))
      elif tag and tag[0] not in '!?':
        # '!' and '?' mark comments, doctypes and processing instructions
        tag = tag.rstrip('/')  # '<br/>' is stored as 'br'
        self.tags.add(tag)
        self.tagstarts.setdefault(tag,[])
        self.tagends.setdefault(tag,[])
        pending.setdefault(tag,[]).append((i,j))
      i = j+1
    # Pairs are completed in closing order; sort them back into document order.
    pairs.sort()
    for startSpan,endSpan,tag in pairs:
      self.tagstarts[tag].append(startSpan)
      self.tagends[tag].append(endSpan)
    self.getElements()

  def _parseAttributes(self,tagData):
    """Return the key=value words of a tag's content as a dict.

    Quotes around keys and values are removed. Values are cut at the first
    whitespace because the content is split on whitespace.
    """
    attrib = {}
    for word in tagData.split():
      if '=' not in word: continue
      # split only once so values that contain '=' (e.g. URLs) stay intact
      key,value = word.split('=',1)
      attrib[key.strip('"\'')] = value.strip('"\'')
    return attrib

  def getElements(self):
    """(Re)build self.elements from tagstarts / tagends.

    The list is rebuilt from scratch on every call, so calling this method
    repeatedly does not create duplicates. img tags are skipped.
    """
    self.elements = []
    for tag in self.tags:
      if tag in ['img']: continue
      starts = self.tagstarts.get(tag,[])
      ends = self.tagends.get(tag,[])
      for startSpan,endSpan in zip(starts,ends):
        element = Element()
        element.tag = tag
        element.startTag,element.endTag = startSpan
        tagData = self._root[element.startTag+1:element.endTag]
        element.attrib = self._parseAttributes(tagData)
        element.keys = element.attrib.keys()
        element.start = element.startTag
        element.end = endSpan[1]
        # content lies between the '>' of the start tag and the '<' of the end tag
        element.innerHTML = (element.endTag+1,endSpan[0])
        self.elements.append(element)
    # self.tags is a set, so restore document order explicitly
    self.elements.sort(key=lambda element: element.start)

  def _stripTags(self,text):
    """Return text with every '<...>' sequence removed."""
    pieces = []
    pos = 0
    while True:
      lt = text.find('<',pos)
      if lt<0: break
      gt = text.find('>',lt)
      if gt<0: break
      pieces.append(text[pos:lt])
      pos = gt+1
    pieces.append(text[pos:])
    return ''.join(pieces)

  def _decodeEntities(self,text):
    """Return text with named entities (e.g. &amp;) replaced using ent.

    Unknown names are left untouched. Replaced text is not scanned again,
    so '&amp;lt;' becomes '&lt;' and not '<'.
    """
    pieces = []
    pos = 0
    while True:
      amp = text.find('&',pos)
      if amp<0: break
      semi = text.find(';',amp+1)
      if semi<0: break
      name = text[amp+1:semi]
      if name in ent:
        pieces.append(text[pos:amp])
        pieces.append(ent[name])
        pos = semi+1
      else:
        # not an entity: keep the '&' and continue right behind it
        pieces.append(text[pos:amp+1])
        pos = amp+1
    pieces.append(text[pos:])
    return ''.join(pieces)

  def getText(self,tag,key,val):
    """Return the plain text of the first matching element.

    tag -- element name, e.g. 'div'
    key -- attribute name, e.g. 'id'
    val -- value the attribute must have

    Nested tags are removed first, then named entities are decoded.
    Returns '' when no element matches.
    """
    for element in self.elements:
      if element.tag!=tag: continue
      # elements without the attribute simply do not match
      if key in element.attrib and element.attrib[key]==val:
        start,end = element.innerHTML
        return self._decodeEntities(self._stripTags(self._root[start:end]))
    return ''
Note that the code does not check whether the HTML is well-formed. It also works only on the body part and ignores img tags.

No comments:

Post a Comment

Please Comment this Post or send me an Email