1 | #!/usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | |
---|
4 | import errno |
---|
5 | import logging |
---|
6 | import os |
---|
7 | import re |
---|
8 | import urllib |
---|
9 | import urlparse |
---|
10 | import xml.etree.ElementTree as ET |
---|
11 | |
---|
12 | aws_base_url = 'http://s3.amazonaws.com/slideshare/' |
---|
13 | logger = logging.getLogger('slideshare-dl') |
---|
14 | |
---|
def get_slideshow_pagecontent(url):
    """Fetch the slideshow page at *url* and return its raw body.

    The URL is opened with ``urllib.urlopen`` and the response is read in
    full. The body is also logged at DEBUG level.

    :param url: address of the slideshow page.
    :returns: the complete response body as returned by ``read()``.
    """
    logger.info('Fetching slideshow page: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        content = page.read()
    finally:
        # Release the connection even when reading fails part-way through.
        page.close()
    logger.debug('content = %r', content)
    return content
---|
22 | |
---|
def get_slideshow_name(data):
    """Extract the slideshow document name from page content.

    Searches *data* for the first ``doc=<name>`` occurrence, where
    ``<name>`` is a run of word characters and hyphens.

    :param data: text of the slideshow page.
    :returns: the captured ``<name>`` string.
    :raises ValueError: if *data* contains no ``doc=<name>`` marker.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    match = re.search(r'doc=([\w-]+)', data)
    if match is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError('No slideshow name ("doc=...") found in page content.')
    return match.group(1)
---|
27 | |
---|
def get_slideshow_xml(name):
    """Fetch the XML description of the slideshow called *name*.

    The document URL is built as ``aws_base_url + name + '.xml'``. The
    body is also logged at DEBUG level.

    :param name: slideshow document name.
    :returns: the complete response body as returned by ``read()``.
    """
    url = aws_base_url + name + '.xml'
    logger.info('Fetching slideshow XML: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        xml = page.read()
    finally:
        # Release the connection even when reading fails part-way through.
        page.close()
    logger.debug('xml = %r', xml)
    return xml
---|
36 | |
---|
def get_slideurls(show_xml):
    """Return the ``Src`` attribute of every ``Slide`` element in *show_xml*.

    :param show_xml: XML document text describing the slideshow.
    :returns: list of ``Src`` values in document order.
    :raises KeyError: if a ``Slide`` element has no ``Src`` attribute.
    """
    logger.info('Extracting slide URLs')
    show_tree = ET.fromstring(show_xml)
    # Element.getiterator() is deprecated and was removed in Python 3.9.
    # Prefer iter() and fall back only where iter() is not available.
    iter_elems = getattr(show_tree, 'iter', None)
    if iter_elems is None:
        iter_elems = show_tree.getiterator
    slide_urls = [elem.attrib['Src'] for elem in iter_elems('Slide')]
    logger.debug('slide_urls = %r', slide_urls)
    return slide_urls
---|
44 | |
---|
def write_slides(urls, path):
    """Download every slide in *urls* into the directory *path*.

    The directory, including any missing parents, is created when it does
    not exist yet. Each slide is saved under the last component of its
    URL path.

    :param urls: iterable of slide URLs.
    :param path: output directory.
    :returns: list of the local file names written, one per URL.
    :raises OSError: if *path* cannot be created or is not a directory.
    """
    logger.info('Writing slides...')
    try:
        # Create first and tolerate "already exists": no check-then-create
        # race, and nested output directories work too.
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
    filenames = []
    for url in urls:
        parsed_url = urlparse.urlparse(url)
        # [-1] also copes with a path that contains no '/' at all.
        slide_name = parsed_url.path.rsplit('/', 1)[-1]
        slide_path = os.path.join(path, slide_name)
        logger.info(' %s', slide_path)
        filename, headers = urllib.urlretrieve(url, slide_path)
        # append(), not +=: extending a list with a string adds each
        # character as a separate item.
        filenames.append(filename)
    return filenames
---|
58 | |
---|
def download_slideshow(url, path=None):
    """Download all slides of the slideshow published at *url*.

    The page is fetched, the slideshow name is extracted from it, the
    slideshow XML is fetched and its slide URLs are written to disk.

    :param url: address of the slideshow page.
    :param path: output directory; when empty or ``None`` a directory
        named after the slideshow is used under the current directory.
    """
    page_content = get_slideshow_pagecontent(url)
    name = get_slideshow_name(page_content)
    urls = get_slideurls(get_slideshow_xml(name))
    if path:
        target_dir = path
    else:
        target_dir = os.path.join(os.curdir, name)
    write_slides(urls, target_dir)
---|
66 | |
---|
67 | |
---|
if __name__ == '__main__':
    # Command-line entry point: parse options, configure logging, then
    # download the slideshow given as the single positional argument.
    import sys
    from optparse import OptionParser

    usage = 'usage: %prog [OPTION]... URL'
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--output-directory', dest='directory',
                      help='write slides to files in DIR',
                      metavar='DIR')
    parser.add_option('-v', '--verbose', action='count', dest='verbosity',
                      help=('explain what is being done (use twice for '
                            'greater effect)'))
    parser.set_defaults(verbosity=0)

    # Parsing stays outside the try block: parser.error() and --help exit
    # through SystemExit, which must reach the interpreter untouched so the
    # usage text and exit status are preserved.
    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error('Incorrect number of arguments.')

    if options.verbosity == 0:
        loglevel = logging.WARN
    elif options.verbosity == 1:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG

    logging.basicConfig(level=loglevel)

    url = args[0]
    try:
        download_slideshow(url, options.directory)
    except KeyboardInterrupt:
        logger.warning('Program interrupted by user.')
        # 128 + SIGINT, the conventional status for an interrupted program.
        sys.exit(130)
    except Exception as e:
        # Exception, not BaseException, so SystemExit is never swallowed.
        logger.exception(e)
        sys.exit(1)
---|