#!/usr/bin/python
# R(ange) URL D(ownloader) see usage()
# version: see below -h option
# author: Ludovic Bellier see http://home.zyrianes.net
# licence: GNU General Public License see http://www.gnu.org/copyleft/gpl.html
"""
CVS Information
$Id: rurld,v 0.11 2006/02/17 00:39:36 ludo Exp $
"""
__version__ = '$Revision: 0.11 $'[11:-2]

import getopt
import os
import re
import string
import subprocess
import sys

# 'bracket' template: <prefix>[<first>-<last>]<suffix>
_BRACKET_RE = re.compile(r'(.*)\[(.*)-(.*)\](.*)')
# 'printf' template: <prefix>%<fmt>-%<fmt>.<extension>
# TBD: do not rely on '\.' but match a real %02d style conversion.
_PRINTF_RE = re.compile(r'(.*)%(.*)-%(.*)\.(.*)')


class Config:
    """Settings shared between main() and Rurld."""

    # If true, just echo the urls, do not download them.
    dry_run = 0
    # Positional command-line arguments once main() has run: element 0 is
    # the range url, the remaining elements are the 'printf' parameters.
    rangeURL = ''
    # How the range is written in the url: 'bracket' or 'printf'.
    rangeURLTemplate = 'bracket'


class Rurld:
    """Expand a range url into concrete urls, then print or download them."""

    def __init__(self, cfg):
        """Keep a reference to the Config instance driving this run."""
        self.config = cfg

    def print_url(self, urls):
        """Print one url per line; a non-list value is printed as is."""
        if isinstance(urls, list):
            for url in urls:
                print(url)
        else:
            print(urls)

    def get_files(self, urls):
        """Download every url of `urls` with a single wget invocation.

        The urls are handed to wget as an argument list, so characters such
        as '&', ';' or '?' are never interpreted by a shell.
        """
        targets = list(urls)
        if not targets:
            # Nothing to fetch: do not start wget just to get a usage error.
            return
        try:
            subprocess.call(['wget'] + targets)
        except OSError as err:
            # wget is missing or not executable: report it, do not crash.
            sys.stderr.write('rurld: cannot run wget: %s\n' % err)

    def actionForURLs(self, urls):
        """Print the urls in dry_run mode, download them otherwise."""
        if self.config.dry_run:
            self.print_url(urls)
        else:
            self.get_files(urls)

    def parseRangeURL(self):
        """Match the configured range url against its template.

        Returns the match object of the 'bracket' pattern, whose groups are
        (prefix, first, last, suffix), or None when the url holds no range
        or the template name is unknown.

        A 'printf' url such as ``pic%02d-%02d.jpg 1 2`` is first rewritten
        into its 'bracket' form (``pic[01-02].jpg``); that rewrite updates
        config.rangeURL and config.rangeURLTemplate, then the url is parsed
        again as a 'bracket' url.
        """
        template = self.config.rangeURLTemplate
        rurl = self.config.rangeURL
        base = rurl[0]

        if template == 'bracket':
            return _BRACKET_RE.match(base)

        if template == 'printf':
            # Warning: in progress...
            # ex: rurld -n -t printf http://home.zyrianes.net/p/basset%02d-%02d.jpg 1 2
            sre_match = _PRINTF_RE.match(base)
            if not sre_match:
                return None
            base_rurl, range_left, range_right, end_url = sre_match.groups()
            params = rurl[1:]
            rng = '%' + range_left + '-%' + range_right
            # For now only %d conversions are supported, hence the int cast.
            rng2 = rng % tuple(map(int, params))
            self.config.rangeURL = [base_rurl + '[' + rng2 + '].' + end_url]
            self.config.rangeURLTemplate = 'bracket'
            return self.parseRangeURL()

        return None

    def buildUrls(self, sre_match):
        """Return the list of urls described by `sre_match`.

        `sre_match` is the value returned by parseRangeURL(). When it is
        None the configured url is returned unchanged as a one-element
        list. Otherwise one url is produced for every integer between the
        two bounds, both included.

        Numbers are zero-padded to the width of the left bound when that
        bound starts with '0': [01-11] gives 01..11, [010-012] gives
        010..012, while [1-11] gives 1..11.

        Raises ValueError when a bound is not an integer.
        """
        if not sre_match:
            return [self.config.rangeURL[0]]

        base_url, range_left, range_right, end_url = sre_match.groups()
        if range_left.startswith('0'):
            width = len(range_left)
        else:
            width = 0
        first = int(range_left)
        last = int(range_right)
        return ['%s%s%s' % (base_url, str(i).zfill(width), end_url)
                for i in range(first, last + 1)]

    def run(self):
        """Parse the range url, expand it and act on the resulting urls."""
        sre_match = self.parseRangeURL()
        urls = self.buildUrls(sre_match)
        self.actionForURLs(urls)


def usage():
    """Write the help text to stderr and exit with status 1."""
    sys.stderr.write("""Range URL Downloader, version %s, gpl
Usage: rurld [-h] [-n] [-t template]
  -h  this help
  -n  dry_run mode, just show urls
  -t  template, template model for rangeURL: bracket|printf
      default value = bracket

for example: rurld http://foo.com/pic[01-11].jpg
  downloads files from http://foo.com/pic01.jpg to http://foo.com/pic11.jpg
  note that [01-11] produces 01->11 and [1-11] 1->11

required: wget ( http://www.gnu.org/software/wget/wget.html )
""" % __version__)
    sys.exit(1)


def main(args):
    """Command-line entry point; `args` is the full argv, program name first."""
    config = Config()
    try:
        options, args = getopt.getopt(args[1:], 'hnt:')
    except getopt.GetoptError as err:
        # Only option-parsing errors lead to the help text; anything else
        # (including SystemExit) must propagate untouched.
        sys.stderr.write('rurld: %s\n' % err)
        usage()
    for k, v in options:
        if k == '-h':
            usage()
        elif k == '-n':
            config.dry_run = 1
        elif k == '-t':
            config.rangeURLTemplate = v
    if not args:
        usage()
    config.rangeURL = args
    rurld = Rurld(config)
    rurld.run()


if __name__ == '__main__':
    main(sys.argv)