Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Best Practices for Python Scripting

Greg Back
August 01, 2015

Best Practices for Python Scripting

The Python REPL is an amazing tool for exploring a problem and quickly finding an answer. But how do you turn that "quick and dirty" snippet into an elegant, reusable module? In this talk, you'll learn how to isolate input, output, configuration options and more from your core logic to transform that block of code into a package and script that behave well in command line environments.

Greg Back

August 01, 2015
Tweet

More Decks by Greg Back

Other Decks in Programming

Transcript

  1. Let’s play in the REPL Python 2.7.10 (default, Jun 10

    2015, 19:42:47) [GCC 4.2.1 Compatible Apple LLVM 6.1.0 (clang-602.0.53)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> with open('gettysburg.txt') as f: ... data = f.read() ... >>> import collections >>> words = collections.defaultdict(int) >>> for word in data.split(): ... word = word.strip('-,.').lower() ... words[word] += 1 ... >>> import operator >>> word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) >>> with open('results.txt', 'w') as f: ... for x in word_freqs[:10]: ... f.write(str(x) + '\n') ... print(x) ...
  2. And the answer is... ('that', 13) ('the', 11) ('we', 10)

    ('to', 8) ('here', 8) ('a', 7) ('nation', 5) ('not', 5) ('for', 5) ('can', 5)
  3. Convert to a script $ cat wordfreq.py with open('gettysburg.txt') as

    f: data = f.read() import collections words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 import operator word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) python wordfreq.py
  4. #!/usr/bin/env python """ wordfreq.py - Count the frequency of words

    in text. """ import collections import operator # Read text with open('gettysburg.txt') as f: data = f.read() # Collect occurrences of each word words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 # Sort the word data from most frequent to least frequent. word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) # Output 10 most frequent words. with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) Clean up ./wordfreq.py python -c "import wordfreq"
  5. ... import collections import operator def main(): # Read text

    with open('gettysburg.txt') as f: data = f.read() # Collect occurrences of each word words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 # Sort the word data from most frequent to least frequent. word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) # Output 10 most frequent words. with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) main() Add main function
  6. ... def main(): ... # Output 10 most frequent words.

    with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) if __name__ == '__main__': main() “Protect” main function ./wordfreq.py python -c "import wordfreq"
  7. ... def read_text(): """Read text""" ... def count_words(data): """Count occurrences

    of each word""" ... def sort_words(words): """Sort the word data from most frequent to least frequent.""" ... def output_results(word_freqs): """Output 10 most frequent words.""" ... def main(): """Main wordfreq function.""" data = read_text() words = count_words(data) word_freqs = sort_words(words) output_results(word_freqs) ... Split up logic
  8. ... def output_results(word_freqs, outstream): """Output 10 most frequent words.""" for

    x in word_freqs[:10]: outstream.write(str(x) + '\n') def main(): """Main wordfreq function.""" if len(sys.argv) < 2: data = sys.stdin.read() else: data = read_text(infile=sys.argv[1]) words = count_words(data) word_freqs = sort_words(words) if len(sys.argv) < 3: output_results(word_freqs, outstream=sys.stdout) else: with open(sys.argv[2], 'w') as outfile: output_results(word_freqs, outstream=outfile) ... Parameterize input and output files ./wordfreq.py gettysburg.txt results.txt cat alice.txt | ./wordfreq.py > alice_results.txt
  9. import argparse ... def parse_args(): """Parse arguments from the command

    line.""" parser = argparse.ArgumentParser("Count the frequency of words in text.") parser.add_argument('infile', nargs="?", help="Source text. Default: STDIN") parser.add_argument('outfile', nargs="?", help="Result destination. Default: STDOUT") return parser.parse_args() def main(): """Main wordfreq function.""" args = parse_args() if not args.infile: data = sys.stdin.read() else: data = read_text(infile=args.infile) words = count_words(data) word_freqs = sort_words(words) if not args.outfile: output_results(word_freqs, outstream=sys.stdout) else: with open(args.outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile) ... Switch to using argparse ./wordfreq.py -h
  10. ... def output_results(word_freqs, outstream, count=10): """Output 10 most frequent words."""

    for x in word_freqs[:count]: outstream.write(str(x) + '\n') def parse_args(): """Parse arguments from the command line.""" parser = argparse.ArgumentParser("Count the frequency of words in text.") parser.add_argument('-c', '--count', type=int, default=10, help="Number of words to report. Default: 10") parser.add_argument('infile', nargs="?", help="Source text. Default: STDIN") parser.add_argument('outfile', nargs="?", help="Result destination. Default: STDOUT") return parser.parse_args() def main(): """Main wordfreq function.""" ... if not args.outfile: output_results(word_freqs, outstream=sys.stdout, count=args.count) else: with open(args.outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile, count=args.count) ... Parameterize number of words ./wordfreq.py -c 20 gettysburg.txt
  11. import os ... def parse_envvars(): """Get options from environment variables."""

    return { 'count': os.environ.get('WF_COUNT'), 'infile': os.environ.get('WF_INFILE'), 'outfile': os.environ.get('WF_OUTFILE'), } ... Environment Variables (1/2) WF_COUNT=20 ./wordfreq.py gettysburg.txt
  12. ... def wordfreq(count=10, infile=None, outfile=None): if not infile: data =

    sys.stdin.read() else: data = read_text(infile=infile) words = count_words(data) word_freqs = sort_words(words) if not outfile: output_results(word_freqs, outstream=sys.stdout, count=count) else: with open(outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile, count=count) ... Split functional code from options
  13. ... def main(): """Main wordfreq function.""" args = parse_args() env

    = parse_envvars() count = int(args.count or env['count']) or 10 infile = args.infile or env['infile'] or None outfile = args.outfile or env['outfile'] or None wordfreq(count, infile, outfile) ... Environment Variables (2/2)
  14. import ConfigParser ... def parse_config(conffile): """Get options from a config

    file.""" config = ConfigParser.ConfigParser() config.read(conffile) options = {} if config.has_option('wordfreq', 'count'): options['count'] = config.getint('wordfreq', 'count') if config.has_option('wordfreq', 'infile'): options['infile'] = config.get('wordfreq', 'infile') if config.has_option('wordfreq', 'outfile'): options['outfile'] = config.get('wordfreq', 'outfile') return options def main(): ... conf = parse_config('wordfreq.conf') count = int(args.count or env['count'] or conf.get('count')) or 10 infile = args.infile or env['infile'] or conf.get('infile') or None outfile = args.outfile or env['outfile'] or conf.get('outfile') or None wordfreq(count, infile, outfile) ... Configuration File $ cat wordfreq.conf [wordfreq] count: 3 ./wordfreq.py gettysburg.txt
  15. ... def main(): ... try: wordfreq(count, infile, outfile) return 0

    except: print >> sys.stderr, "An error occurred" return 1 if __name__ == '__main__': sys.exit(main()) Use return code ./wordfreq.py missing.txt
  16. $ cat setup.py from setuptools import setup setup( name="wordfreq", author="Greg

    Back", version='0.0.1', py_modules=['wordfreq'], entry_points={ 'console_scripts': ['wordfreq = wordfreq:main'] } ) Add setup.py pip install wordfreq wordfreq -c 15 gettysburg.txt python -c "import wordfreq; wordfreq.wordfreq(infile='gettysburg.txt', count=25)"
  17. Summary 1. Convert scripts to importable modules 2. Parameterize input/output.

    Permit STDIN/STDOUT. 3. Gather options from ◦ arguments ◦ environment variables ◦ config files 4. Split core logic from option parsing 5. Use proper return values 6. Make module installable