miniHTMLParser.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. #Licensed to the Apache Software Foundation (ASF) under one
  2. #or more contributor license agreements. See the NOTICE file
  3. #distributed with this work for additional information
  4. #regarding copyright ownership. The ASF licenses this file
  5. #to you under the Apache License, Version 2.0 (the
  6. #"License"); you may not use this file except in compliance
  7. #with the License. You may obtain a copy of the License at
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #Unless required by applicable law or agreed to in writing, software
  10. #distributed under the License is distributed on an "AS IS" BASIS,
  11. #WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. #See the License for the specific language governing permissions and
  13. #limitations under the License.
  14. import urllib, urlparse, re
  15. from HTMLParser import HTMLParser
  16. class miniHTMLParser( HTMLParser ):
  17. viewedQueue = []
  18. instQueue = []
  19. def setBaseUrl(self, url):
  20. self.baseUrl = url
  21. def getNextLink( self ):
  22. if self.instQueue == []:
  23. return None
  24. else:
  25. return self.instQueue.pop(0)
  26. def handle_starttag( self, tag, attrs ):
  27. if tag == 'a':
  28. newstr = urlparse.urljoin(self.baseUrl, str(attrs[0][1]))
  29. if re.search('mailto', newstr) != None:
  30. return
  31. if (newstr in self.viewedQueue) == False:
  32. self.instQueue.append( newstr )
  33. self.viewedQueue.append( newstr )