source: slideshare-dl/slideshare-dl.py

Last change on this file was 52, checked in by simon, 15 years ago

Remove spurious error log call.

  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-python
File size: 3.1 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import errno
5import logging
6import os
7import re
8import urllib
9import urlparse
10import xml.etree.ElementTree as ET
11
12aws_base_url = 'http://s3.amazonaws.com/slideshare/'
13logger = logging.getLogger('slideshare-dl')
14
def get_slideshow_pagecontent(url):
    """Fetch the slideshow page and return its raw content.

    url -- address of the slideshow page, handed to urllib.urlopen as is.

    Returns the complete response body as read from the opened page.
    Errors raised while opening or reading the page propagate to the caller.
    """
    logger.info('Fetching slideshow page: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        content = page.read()
    finally:
        # Release the connection even when reading the body fails.
        page.close()
    logger.debug('content = %r', content)
    return content
22
def get_slideshow_name(data):
    """Extract the slideshow name from the content of a slideshow page.

    data -- text of the page; it is searched for the first 'doc=<name>'
            occurrence, where <name> is a run of word characters and
            hyphens.

    Returns the matched name.
    Raises ValueError if the content contains no 'doc=<name>' parameter.
    """
    # Raw string so that \w reaches the regex engine exactly as written.
    match = re.search(r'doc=([\w-]+)', data)
    if match is None:
        raise ValueError("No 'doc=' parameter found in slideshow page.")
    return match.group(1)
27
def get_slideshow_xml(name):
    """Fetch the XML description of the slideshow called *name*.

    name -- slideshow name; the document URL is built as
            aws_base_url + name + '.xml'.

    Returns the complete response body as read from the opened URL.
    Errors raised while opening or reading the URL propagate to the caller.
    """
    url = aws_base_url + name + '.xml'
    logger.info('Fetching slideshow XML: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        xml = page.read()
    finally:
        # Release the connection even when reading the body fails.
        page.close()
    logger.debug('xml = %r', xml)
    return xml
36
def get_slideurls(show_xml):
    """Return the URL of every slide listed in the slideshow XML.

    show_xml -- text of the XML document; the 'Src' attribute of each
                'Slide' element found anywhere in it is collected.

    Returns a list with one 'Src' value per 'Slide' element.
    Raises KeyError if a 'Slide' element has no 'Src' attribute; parse
    errors from ElementTree propagate to the caller.
    """
    logger.info('Extracting slide URLs')
    show_tree = ET.fromstring(show_xml)
    # Element.getiterator() is deprecated and absent from newer ElementTree
    # versions; use iter() when available and fall back only on old ones.
    find_all = getattr(show_tree, 'iter', None) or show_tree.getiterator
    slide_urls = [elem.attrib['Src'] for elem in find_all('Slide')]
    logger.debug('slide_urls = %r', slide_urls)
    return slide_urls
44
def write_slides(urls, path):
    """Download every slide in *urls* into the directory *path*.

    urls -- iterable of slide URLs; each local file is named after the last
            segment of the URL's path component.
    path -- output directory; it is created, including missing parent
            directories, when it does not exist yet.

    Returns the list of local filenames reported by urllib.urlretrieve, one
    per URL and in the same order.
    Errors raised while creating the directory or retrieving a slide
    propagate to the caller.
    """
    logger.info('Writing slides...')
    if not os.path.isdir(path):
        # makedirs also handles nested output paths, and raises a clear
        # OSError when *path* exists but is not a directory.
        os.makedirs(path)
    filenames = []
    for url in urls:
        parsed_url = urlparse.urlparse(url)
        # [-1] keeps working for a URL path that contains no slash at all.
        slide_name = parsed_url.path.rsplit('/', 1)[-1]
        slide_path = os.path.join(path, slide_name)
        logger.info('%s', slide_path)
        filename = urllib.urlretrieve(url, slide_path)[0]
        # append, not +=: extending a list with a string adds its characters.
        filenames.append(filename)
    return filenames
58
def download_slideshow(url, path=None):
    """Download all slides of the slideshow published at *url*.

    url  -- address of the slideshow page.
    path -- output directory; when empty or None, a directory named after
            the slideshow below the current directory is used instead.
    """
    page_content = get_slideshow_pagecontent(url)
    name = get_slideshow_name(page_content)
    urls = get_slideurls(get_slideshow_xml(name))
    if path:
        target_dir = path
    else:
        target_dir = os.path.join(os.curdir, name)
    write_slides(urls, target_dir)
66
67
if __name__ == '__main__':
    # Command-line entry point: parse the options, configure logging and
    # download the slideshow named by the single positional argument.
    from optparse import OptionParser

    try:
        usage = 'usage: %prog [OPTION]... URL'
        parser = OptionParser(usage=usage)
        parser.add_option('-d', '--output-directory', dest='directory',
                          help='write slides to files in DIR',
                          metavar='DIR')
        parser.add_option('-v', '--verbose', action='count', dest='verbosity',
                          help=('explain what is being done (use twice for '
                               'greater effect)'))
        parser.set_defaults(verbosity=0)
        options, args = parser.parse_args()

        if len(args) != 1:
            parser.error('Incorrect number of arguments.')

        # Each -v moves one step down this list; two or more mean DEBUG.
        levels = (logging.WARNING, logging.INFO, logging.DEBUG)
        loglevel = levels[min(options.verbosity, len(levels) - 1)]
        logging.basicConfig(level=loglevel)

        download_slideshow(args[0], options.directory)
    except KeyboardInterrupt:
        logger.warning('Program interrupted by user.')
    except Exception as e:
        # Exception rather than BaseException: the SystemExit that optparse
        # raises for --help and usage errors must terminate the program
        # with its own exit status instead of being logged as a failure.
        logger.exception(e)
Note: See TracBrowser for help on using the repository browser.