source: slideshare-dl/slideshare-dl.py @ 51

Last change on this file since 51 was 51, checked in by simon, 16 years ago
  • Added slideshare-dl, a tool to download slides from SlideShare.
  • Property svn:eol-style set to native
  • Property svn:mime-type set to text/x-python
File size: 3.1 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4import errno
5import logging
6import os
7import re
8import urllib
9import urlparse
10import xml.etree.ElementTree as ET
11
# Base URL under which the XML description of a slideshow is fetched;
# get_slideshow_xml() appends '<name>.xml' to it.
aws_base_url = 'http://s3.amazonaws.com/slideshare/'
# Logger shared by every function in this script.
logger = logging.getLogger('slideshare-dl')
14
def get_slideshow_pagecontent(url):
    """Fetch the slideshow web page and return its raw content.

    :param url: address of the slideshow page, opened with
        ``urllib.urlopen``.
    :returns: everything read from the response.
    """
    logger.info('Fetching slideshow page: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        content = page.read()
    finally:
        # Release the connection even when the read fails part-way.
        page.close()
    logger.debug('content = %r', content)
    return content
22
def get_slideshow_name(data):
    """Extract the slideshow name from the content of its web page.

    The name is the value of the first ``doc=<name>`` occurrence in
    *data*, where ``<name>`` is made of word characters and hyphens.

    :param data: content of the slideshow page.
    :returns: the matched name.
    :raises ValueError: if *data* contains no ``doc=<name>`` occurrence.
    """
    match = re.search(r'doc=([\w-]+)', data)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('No slideshow name ("doc=...") found in page content.')
    return match.group(1)
27
def get_slideshow_xml(name):
    """Fetch the XML description of the slideshow called *name*.

    The document is read from ``aws_base_url + name + '.xml'``.

    :param name: slideshow name, as returned by ``get_slideshow_name``.
    :returns: the XML document as read from the response.
    """
    url = aws_base_url + name + '.xml'
    logger.info('Fetching slideshow XML: <%s>.', url)
    page = urllib.urlopen(url)
    try:
        xml = page.read()
    finally:
        # Release the connection even when the read fails part-way.
        page.close()
    logger.debug('xml = %r', xml)
    return xml
36
def get_slideurls(show_xml):
    """Return the URL of every slide described by *show_xml*.

    :param show_xml: slideshow description as an XML string.
    :returns: list holding the ``Src`` attribute of each ``Slide``
        element found in the document.
    :raises KeyError: if a ``Slide`` element has no ``Src`` attribute.
    """
    logger.info('Extracting slide URLs')
    show_tree = ET.fromstring(show_xml)
    # Element.getiterator() is deprecated in favour of Element.iter() and
    # is absent from newer ElementTree releases, while older releases only
    # provide getiterator(); use whichever one this interpreter has.
    iter_elems = getattr(show_tree, 'iter', None) or show_tree.getiterator
    slide_urls = [elem.attrib['Src'] for elem in iter_elems('Slide')]
    logger.debug('slide_urls = %r', slide_urls)
    return slide_urls
44
def write_slides(urls, path):
    """Download every slide in *urls* into the directory *path*.

    The directory (including missing parents) is created when it does not
    exist yet.  Each slide is stored under the last component of its URL
    path.

    :param urls: iterable of slide URLs.
    :param path: directory the slides are written to.
    :returns: list of the local filenames, one per URL, in input order.
    :raises OSError: if *path* cannot be created and is not already a
        directory.
    """
    logger.info('Writing slides...')
    filenames = []
    try:
        os.makedirs(path)
    except OSError:
        # An already existing directory is fine; anything else (permission
        # denied, a regular file in the way, ...) is a real error.
        if not os.path.isdir(path):
            raise
    for url in urls:
        parsed_url = urlparse.urlparse(url)
        # [-1] instead of [1]: a path without any '/' is used as-is rather
        # than raising IndexError.
        slide_name = parsed_url.path.rsplit('/', 1)[-1]
        slide_path = os.path.join(path, slide_name)
        logger.info('%s', slide_path)
        filename = urllib.urlretrieve(url, slide_path)[0]
        # append(), not +=: += would add the name one character at a time.
        filenames.append(filename)
    return filenames
58
def download_slideshow(url, path=None):
    """Download all slides of the slideshow published at *url*.

    The page at *url* is fetched, the slideshow name is extracted from it,
    the slideshow's XML description is fetched, and every slide listed
    there is written to the output directory.

    :param url: address of the slideshow page.
    :param path: output directory; when not given (or empty), a directory
        named after the slideshow below the current directory is used.
    :returns: the value returned by ``write_slides``.
    """
    content = get_slideshow_pagecontent(url)
    show_name = get_slideshow_name(content)
    show_xml = get_slideshow_xml(show_name)
    slide_urls = get_slideurls(show_xml)
    out_dir = path or os.path.join(os.curdir, show_name)
    return write_slides(slide_urls, out_dir)
67
68
if __name__ == '__main__':
    # Command-line entry point: parse the options, configure logging from
    # the requested verbosity, then download the slideshow.
    import sys
    from optparse import OptionParser

    usage = 'usage: %prog [OPTION]... URL'
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--output-directory', dest='directory',
                      help='write slides to files in DIR',
                      metavar='DIR')
    parser.add_option('-v', '--verbose', action='count', dest='verbosity',
                      help=('explain what is being done (use twice for '
                            'greater effect)'))
    parser.set_defaults(verbosity=0)
    # Option parsing stays outside the try block: --help and parser.error()
    # leave through SystemExit, which must reach the interpreter untouched.
    options, args = parser.parse_args()

    if len(args) != 1:
        parser.error('Incorrect number of arguments.')

    # No -v: warnings only; -v: progress messages; -vv or more: debug dumps.
    if options.verbosity == 0:
        loglevel = logging.WARN
    elif options.verbosity == 1:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG

    logging.basicConfig(level=loglevel)

    url = args[0]
    try:
        download_slideshow(url, options.directory)
    except KeyboardInterrupt:
        logger.warning('Program interrupted by user.')
        # 128 + SIGINT, the conventional status for an interrupted program.
        sys.exit(130)
    except Exception:
        # sys.exc_info() avoids the version-specific "except X, e" /
        # "except X as e" syntax while logging the same message as before.
        logger.exception(sys.exc_info()[1])
        # Report the failure to the calling shell instead of exiting with 0.
        sys.exit(1)
Note: See TracBrowser for help on using the repository browser.