#!/usr/bin/env python
"""Download every subpage image of one TVP Telegazeta channel.

Usage: tgdl CHANNEL        (for instance: tgdl TG1)

For each page number from 100 to 999 the script asks the Telegazeta web
viewer how many subpages the page has, then saves each subpage image into
the current directory as ``CHANNEL-PPP-SSS.png``.  Anything that is not
served as ``image/png`` is discarded.

The code runs unchanged on Python 2 and Python 3.
"""
import os
import re
import sys
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
    from urllib.error import HTTPError as _HTTP_ERRORS
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve
    # Python 2's urllib.urlretrieve never raises on an HTTP error status; it
    # saves the error page, which the content-type check then deletes.  An
    # empty tuple makes the ``except`` clause below match nothing there.
    _HTTP_ERRORS = ()

FIRST_PAGE = 100
LAST_PAGE = 999

INDEX_URL = "http://telegazeta.tvp.pl/telegazeta.php?page=%s&channel=%s"
IMAGE_URL = "http://www.telegazeta.tvp.pl/sync/ncexp/%s/%d/%03d_%04d.png"

USAGE = ('Usage: "tgdl CHANNEL". For instance: "tgdl TG1". '
         'Available channels are: TG1, TG2, SAT, KUL, SPO')

# Matches the "<current> / <total>" subpage counter; group 1 is the total.
_SUBPAGE_COUNTER = re.compile(r'\d+ / (\d+)')


def parse_subpage_count(html):
    """Return the subpage total from the first ``N / M`` counter in *html*.

    Returns 0 when *html* contains no counter, so that callers simply skip
    the page instead of failing with an IndexError.
    """
    match = _SUBPAGE_COUNTER.search(html)
    return int(match.group(1)) if match else 0


def number_of_subpages(channel, page):
    """Fetch the viewer page for *channel*/*page* and return its subpage count.

    Returns 0 if the fetched document carries no subpage counter.
    """
    with closing(urlopen(INDEX_URL % (page, channel))) as response:
        html = response.read().decode("iso8859-2")
    return parse_subpage_count(html)


def subpage_url(channel, page, subpage):
    """Return the image URL of one subpage.

    Images are grouped on the server by hundreds (100, 200, ... 900), so the
    group is *page* rounded down to a multiple of 100.
    """
    # Floor division: plain "/" yields a float on Python 3 and breaks the URL.
    group = (page // 100) * 100
    return IMAGE_URL % (channel, group, page, subpage)


def subpages(channel, page):
    """Yield ``(subpage, url)`` for every subpage of *page* on *channel*.

    Raises:
        ValueError: if *page* is outside 100..999 (raised when iteration
            starts, because this is a generator).
    """
    if not FIRST_PAGE <= page <= LAST_PAGE:
        raise ValueError("page must be between %d and %d, got %r"
                         % (FIRST_PAGE, LAST_PAGE, page))
    for subpage in range(1, number_of_subpages(channel, page) + 1):
        yield subpage, subpage_url(channel, page, subpage)


def pages(channel):
    """Yield ``(page, subpage, url)`` for every subpage of every page."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        for subpage, url in subpages(channel, page):
            yield page, subpage, url


def _content_type(headers):
    """Return the bare, lower-cased MIME type from a response header object."""
    return headers.get("Content-Type", "").split(";")[0].strip().lower()


def download_channel(channel):
    """Download all subpage images of *channel* into the current directory.

    Progress is printed as ``channel page subpage``.  A subpage whose
    response is not ``image/png`` is deleted again; a subpage the server
    answers with an HTTP error status is skipped with a note on stderr.
    """
    for page, subpage, url in pages(channel):
        print("%s %d %d" % (channel, page, subpage))
        target = "%s-%03d-%03d.png" % (channel, page, subpage)
        try:
            filename, headers = urlretrieve(url, target)
        except _HTTP_ERRORS as exc:
            sys.stderr.write("skipping %s: %s\n" % (url, exc))
            continue
        if _content_type(headers) != "image/png":
            os.remove(filename)


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    *argv* defaults to ``sys.argv``.  Without a channel argument the usage
    text is written to stderr and 2 is returned.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write(USAGE + "\n")
        return 2
    download_channel(argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())