#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download the individual slides of a SlideShare slideshow.

The slideshow page is fetched and searched for a ``doc=<name>`` marker; the
XML manifest ``<aws_base_url><name>.xml`` is then downloaded and every
``Slide`` element's ``Src`` attribute is retrieved into an output directory.
"""

import contextlib
import errno
import logging
import os
import re
import sys
import xml.etree.ElementTree as ET

try:  # Python 3 module layout
    from urllib.parse import urlparse as _urlparse
    from urllib.request import urlopen as _urlopen
    from urllib.request import urlretrieve as _urlretrieve
except ImportError:  # Python 2 module layout
    from urllib import urlopen as _urlopen
    from urllib import urlretrieve as _urlretrieve
    from urlparse import urlparse as _urlparse


# Base URL that the slideshow XML manifests are fetched from.
aws_base_url = 'http://s3.amazonaws.com/slideshare/'

logger = logging.getLogger('slideshare-dl')

# Pattern locating the slideshow name inside the slideshow page.
_DOC_NAME_RE = re.compile(r'doc=([\w-]+)')


def _fetch(url):
    """Return the raw body of *url*, closing the connection even on error."""
    with contextlib.closing(_urlopen(url)) as page:
        return page.read()


def get_slideshow_pagecontent(url):
    """Fetch the slideshow page at *url* and return its raw content."""
    logger.info('Fetching slideshow page: <%s>.', url)
    content = _fetch(url)
    logger.debug('content = %r', content)
    return content


def get_slideshow_name(data):
    """Extract the slideshow name from the page content *data*.

    *data* may be text or (on Python 3) the bytes returned by the fetch
    helpers. The name is the run of word characters and dashes following
    the first ``doc=`` occurrence.

    Raises:
        ValueError: if *data* contains no ``doc=<name>`` marker.
    """
    # On Python 3 the fetched page is bytes; on Python 2 ``bytes is str`` so
    # this branch is never taken and the input is used unchanged.
    if isinstance(data, bytes) and not isinstance(data, str):
        data = data.decode('utf-8', 'replace')
    match = _DOC_NAME_RE.search(data)
    if match is None:
        raise ValueError('No slideshow name ("doc=...") found in page content.')
    return match.group(1)


def get_slideshow_xml(name):
    """Fetch and return the XML manifest of the slideshow called *name*."""
    url = aws_base_url + name + '.xml'
    logger.info('Fetching slideshow XML: <%s>.', url)
    xml = _fetch(url)
    logger.debug('xml = %r', xml)
    return xml


def get_slideurls(show_xml):
    """Return the ``Src`` attribute of every ``Slide`` element in *show_xml*.

    Raises:
        KeyError: if a ``Slide`` element has no ``Src`` attribute.
    """
    logger.info('Extracting slide URLs')
    show_tree = ET.fromstring(show_xml)
    # Element.getiterator() was removed in Python 3.9; fall back to it only
    # on interpreters whose ElementTree predates Element.iter().
    iter_elems = getattr(show_tree, 'iter', None) or show_tree.getiterator
    slide_urls = [elem.attrib['Src'] for elem in iter_elems('Slide')]
    logger.debug('slide_urls = %r', slide_urls)
    return slide_urls


def write_slides(urls, path):
    """Download every URL in *urls* into the directory *path*.

    The directory (including missing parents) is created when needed. Each
    file is named after the last path segment of its URL.

    Returns:
        The list of written file paths, in the order of *urls*.

    Raises:
        ValueError: if a URL has no final path segment to use as file name.
    """
    logger.info('Writing slides...')
    try:
        os.makedirs(path)
    except OSError as exc:
        # An already existing directory is fine; anything else is an error.
        if exc.errno != errno.EEXIST:
            raise

    filenames = []
    for url in urls:
        slide_name = _urlparse(url).path.rsplit('/', 1)[-1]
        if not slide_name:
            raise ValueError(
                'Cannot derive a file name from slide URL: %r' % (url,))
        slide_path = os.path.join(path, slide_name)
        logger.info('  %s', slide_path)
        filename, _headers = _urlretrieve(url, slide_path)
        # append, not +=: extending a list with a string adds its characters.
        filenames.append(filename)
    return filenames


def download_slideshow(url, path=None):
    """Download all slides of the slideshow at *url*.

    Slides are written to *path*, or to a directory named after the
    slideshow below the current directory when *path* is not given.
    """
    content = get_slideshow_pagecontent(url)
    show_name = get_slideshow_name(content)
    show_xml = get_slideshow_xml(show_name)
    slide_urls = get_slideurls(show_xml)
    out_dir = path or os.path.join(os.curdir, show_name)
    write_slides(slide_urls, out_dir)


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    *argv* is the argument list without the program name; ``None`` means
    ``sys.argv[1:]``.
    """
    from optparse import OptionParser

    usage = 'usage: %prog [OPTION]... URL'
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--output-directory', dest='directory',
                      help='write slides to files in DIR', metavar='DIR')
    parser.add_option('-v', '--verbose', action='count', dest='verbosity',
                      help=('explain what is being done (use twice for '
                            'greater effect)'))
    parser.set_defaults(verbosity=0)

    # Parsed outside the try block: parser.error() and --help exit through
    # SystemExit, which must propagate instead of being logged as a failure.
    options, args = parser.parse_args(argv)
    if len(args) != 1:
        parser.error('Incorrect number of arguments.')

    if options.verbosity == 0:
        loglevel = logging.WARN
    elif options.verbosity == 1:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG
    logging.basicConfig(level=loglevel)

    try:
        download_slideshow(args[0], options.directory)
    except KeyboardInterrupt:
        logger.warning('Program interrupted by user.')
        return 1
    except Exception as e:
        logger.exception(e)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())