[51] | 1 | #!/usr/bin/env python |
---|
| 2 | # -*- coding: utf-8 -*- |
---|
| 3 | |
---|
| 4 | import errno |
---|
| 5 | import logging |
---|
| 6 | import os |
---|
| 7 | import re |
---|
| 8 | import urllib |
---|
| 9 | import urlparse |
---|
| 10 | import xml.etree.ElementTree as ET |
---|
| 11 | |
---|
# Base URL of the Amazon S3 bucket hosting SlideShare's assets; the
# slideshow descriptor XML (and the slide images it lists) live under it.
aws_base_url = 'http://s3.amazonaws.com/slideshare/'

# Module-level logger; the level is configured by the __main__ block
# from the -v/--verbose count via logging.basicConfig.
logger = logging.getLogger('slideshare-dl')
def get_slideshow_pagecontent(url):
    """Fetch the slideshow's HTML page.

    :param url: URL of the SlideShare slideshow page.
    :returns: the raw page body as a string.
    """
    logger.info('Fetching slideshow page: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        # try/finally so the connection is released even if read() raises
        # (the original leaked the handle on a failed read).
        content = page.read()
    finally:
        page.close()
    logger.debug('content = %r', content)
    return content
def get_slideshow_name(data):
    """Extract the slideshow's document name from the page HTML.

    The page embeds a "doc=<name>" parameter; that name keys the
    slideshow's descriptor XML and slide images on S3.

    :param data: HTML content of the slideshow page.
    :returns: the document name (word characters and hyphens).
    :raises ValueError: if no "doc=" parameter is present.
    """
    # Raw string so the regex escape \w is not left to the str literal
    # (non-raw '\w' is a SyntaxWarning on modern Pythons).
    match = re.search(r'doc=([\w-]+)', data)
    if match is None:
        # Fail with a clear message instead of the original's opaque
        # AttributeError on match.groups().
        raise ValueError('No slideshow document name found in page content.')
    return match.group(1)
def get_slideshow_xml(name):
    """Fetch the slideshow's descriptor XML from S3.

    :param name: document name as returned by get_slideshow_name().
    :returns: the XML document as a string.
    """
    url = aws_base_url + name + '.xml'
    logger.info('Fetching slideshow XML: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        # try/finally so the connection is released even if read() raises
        # (the original leaked the handle on a failed read).
        xml = page.read()
    finally:
        page.close()
    logger.debug('xml = %r', xml)
    return xml
def get_slideurls(show_xml):
    """Parse the descriptor XML and return the list of slide image URLs.

    :param show_xml: XML text as fetched by get_slideshow_xml(); each
        <Slide> element carries its image URL in the "Src" attribute.
    :returns: list of slide URLs in document order.
    """
    logger.info('Extracting slide URLs')
    show_tree = ET.fromstring(show_xml)
    # Element.iter() replaces the deprecated getiterator() (removed in
    # Python 3.9); it is available from Python 2.7 onwards.
    slide_elems = show_tree.iter('Slide')
    slide_urls = [elem.attrib['Src'] for elem in slide_elems]
    logger.debug('slide_urls = %r', slide_urls)
    return slide_urls
def write_slides(urls, path):
    """Download each slide image into directory `path`.

    Creates `path` if it does not exist yet.

    :param urls: iterable of slide image URLs.
    :param path: destination directory for the downloaded files.
    :returns: list of local filenames written.
    """
    logger.info('Writing slides...')
    filenames = []
    try:
        # makedirs + EEXIST check avoids the exists()/mkdir() race and
        # also creates intermediate directories; errno was imported for
        # exactly this but never used.
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    for url in urls:
        parsed_url = urlparse.urlparse(url)
        # The last path component of the URL is the slide's file name.
        slide_name = parsed_url.path.rsplit('/', 1)[1]
        slide_path = os.path.join(path, slide_name)
        logger.info(' %s', slide_path)
        filename, headers = urllib.urlretrieve(url, slide_path)
        # BUG FIX: `filenames += filename` extended the list with each
        # CHARACTER of the filename string; append() stores it whole.
        filenames.append(filename)
    return filenames
def download_slideshow(url, path=None):
    """Download every slide of the slideshow behind `url`.

    Slides are saved into `path`; when `path` is not given, a directory
    named after the slideshow is created under the current directory.
    """
    page_html = get_slideshow_pagecontent(url)
    name = get_slideshow_name(page_html)
    descriptor_xml = get_slideshow_xml(name)
    urls = get_slideurls(descriptor_xml)
    if path:
        target_dir = path
    else:
        target_dir = os.path.join(os.curdir, name)
    write_slides(urls, target_dir)
if __name__ == '__main__':
    try:
        from optparse import OptionParser

        # Command line: slideshare-dl [-d DIR] [-v [-v]] URL
        usage = 'usage: %prog [OPTION]... URL'
        parser = OptionParser(usage=usage)
        parser.add_option('-d', '--output-directory', dest='directory',
                          help='write slides to files in DIR',
                          metavar='DIR')
        parser.add_option('-v', '--verbose', action='count', dest='verbosity',
                          help=('explain what is being done (use twice for '
                                'greater effect)'))
        parser.set_defaults(verbosity=0)
        options, args = parser.parse_args()

        if len(args) != 1:
            parser.error('Incorrect number of arguments.')

        # Map the -v count to a log level; default (0) shows warnings only.
        # The trailing else guarantees loglevel is always bound.
        if options.verbosity >= 2:
            loglevel = logging.DEBUG
        elif options.verbosity == 1:
            loglevel = logging.INFO
        else:
            loglevel = logging.WARN

        logging.basicConfig(level=loglevel)

        url = args[0]
        download_slideshow(url, options.directory)
    except KeyboardInterrupt:
        logger.warn('Program interrupted by user.')
    except Exception as e:
        # BUG FIX: was `except BaseException`, which also intercepted the
        # SystemExit raised by parser.error(), turning a clean usage-error
        # exit into a logged traceback. `Exception` leaves SystemExit and
        # KeyboardInterrupt alone; `as` syntax is valid on 2.6+ and 3.x.
        logger.exception(e)