#!/usr/bin/python
# -*- coding: ISO-8859-1 -*-
import urllib
import urllib2
import re
import HTMLParser
import threading
import Queue
import sys
try: import psyco ; psyco.jit() # If present, use psyco to accelerate the program
except: pass
THREAD_LIMIT = 50
packages_to_track = Queue.Queue(0)
parsed_results = Queue.Queue(THREAD_LIMIT)
class html2txt(HTMLParser.HTMLParser):
    ''' A basic parser which converts HTML tables into plain text.
    Feed it HTML with feed(); retrieve the text with getTXT(). (See the usage sketch below the class.)
    '''
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
        self.TXT = '' # The text output built so far
        self.TXTrow = '' # The current text row being built from the HTML
        self.inTD = 0 # Tracks whether we are inside a <TD>...</TD> tag
        self.inTR = 0 # Tracks whether we are inside a <TR>...</TR> tag
        self.re_multiplespaces = re.compile(r'\s+') # regular expression used to collapse runs of whitespace
        self.rowCount = 0 # Output line counter
def handle_starttag(self, tag, attrs):
if tag == 'tr': self.start_tr()
elif tag == 'td': self.start_td()
def handle_endtag(self, tag):
if tag == 'tr': self.end_tr()
elif tag == 'td': self.end_td()
def start_tr(self):
if self.inTR: self.end_tr() # <TR> implies </TR>
self.inTR = 1
def end_tr(self):
if self.inTD: self.end_td() # </TR> implies </TD>
self.inTR = 0
if len(self.TXTrow) > 0:
self.TXT += self.TXTrow[:-1]
self.TXTrow = ''
self.TXT += '\n'
self.rowCount += 1
def start_td(self):
if not self.inTR: self.start_tr() # <TD> implies <TR>
self.inTD = 1
    def end_td(self):
        if self.inTD:
            self.TXTrow += '\t' # tab-separate cells; the trailing tab is stripped in end_tr()
            self.inTD = 0
def handle_data(self, data):
        if self.inTD:
            # Collapse runs of whitespace and drop newlines from the cell content.
            self.TXTrow += self.re_multiplespaces.sub(' ', data.replace('\t', ' ').replace('\n', '').replace('\r', ''))
def getTXT(self,purge=False):
        ''' Get the output text.
        If purge is True, getTXT() will return all remaining data,
        even if <td> or <tr> tags were not properly closed.
        (You would typically call getTXT(purge=True) when you have no more HTML
        to feed and you suspect dirty HTML with unclosed tags.) '''
if purge and self.inTR: self.end_tr() # This will also end_td and append last TXT row to output text.
dataout = self.TXT[:]
self.TXT = ''
return dataout
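# A minimal usage sketch for the html2txt class above; the one-row table below
# is made up for illustration. Rows come back one per line, cells tab-separated.
#   parser = html2txt()
#   parser.feed('<table><tr><td>Status</td><td>Delivered</td></tr></table>')
#   print parser.getTXT(purge=True)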
def simple_request(tracking_number):
"""does a simple request to the tracking server"""
tracking_url = 'http://wwwapps.ups.com/WebTracking/processInputRequest'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
values = {'HTMLVersion' : '5.0',
'loc' : 'en_FR',
'Requester' : 'UPSHome',
'tracknum' : tracking_number,
'AgreeToTermsAndConditions' : 'yes',
'ignore' : '',
'track.x' : '13',
'track.y' : '11' }
data = urllib.urlencode(values)
req = urllib2.Request(tracking_url, data, headers)
try:
handle = urllib2.urlopen(req)
    except IOError, e:
        # Return the error message as a string so the caller can still pass it to cut_html().
        if hasattr(e, 'reason'):
            return 'Server unreachable. Reason: %s' % e.reason
        elif hasattr(e, 'code'):
            print e.code
            return 'Server error. Error code: %s' % e.code
        return 'Request failed: %s' % e
    else:
        return handle.read()
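# Example call (hypothetical tracking number, in the same format that run()
# generates below); expect the raw HTML of the tracking page, or an error
# string, as the return value:
#   html = simple_request('1Z 1WX 989 68 8550 000 0')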
def cut_html(html):
    """Keeps only the fragments of the UPS response page that sit between the two marker comments below (the package progress tables)."""
    beginTag = "<!-- Begin: Exception Error Message -->"
    endTag = "<!-- End Package Progress -->"
    out = ""
startPos = html.find(beginTag)
while startPos > -1:
endPos = html.find(endTag,startPos+1)
        if endPos == -1:
break
else:
out+=html[startPos+len(beginTag):endPos]
startPos = html.find(beginTag,endPos+1)
return out
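# Illustration of what cut_html() extracts, on a made-up snippet (only the part
# between the two marker comments is kept):
#   page = ('<!-- Begin: Exception Error Message -->'
#           '<table><tr><td>In transit</td></tr></table>'
#           '<!-- End Package Progress -->')
#   print cut_html('<html>' + page + '</html>')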
def thread():
    """Worker: fetches, cuts and parses tracking information until the queue of packages is empty."""
while True:
try:
tracking_number = packages_to_track.get(False)
except Queue.Empty:
return
parser = html2txt()
parser.feed(cut_html(simple_request(tracking_number)))
parsed_results.put((tracking_number, parser.getTXT()), True)
def run():
jobs_done = 0
sys.stderr.write("Starting...\n" )
for a in range(10):
for b in range(1000):
for c in range(10):
packages_to_track.put('1Z 1WX 989 68 855'+str(a)+' '+str(b).zfill(3)+' '+str(c))
sys.stderr.write("Created 100 000 job entries\n" )
for n in xrange(THREAD_LIMIT): # Unleash the hounds
t = threading.Thread(target=thread)
t.start()
sys.stderr.write("Unleashed the hounds...\n" )
while threading.activeCount()> 1 or not packages_to_track.empty():
# That condition means we want to do this loop if there are threads
# running OR there's stuff to process
try:
            tracking_number, tracking_info = parsed_results.get(True, 1) # Block for up to a second waiting for a result
except Queue.Empty:
continue
jobs_done += 1
sys.stderr.write("Processed package #%s - Got %s bytes data - %s processed so far\n" % ( str(tracking_number), len(tracking_info), str(jobs_done) ) )
print tracking_number, '\n', tracking_info
if __name__ == '__main__':
run()