Slide 1

Slide 1 text

Best Practices for Python Scripting Greg Back PyOhio 2015 @gtback /gtback

Slide 2

Slide 2 text

The Problem Find the 10 words that occur most frequently in the Gettysburg Address.

Slide 3

Slide 3 text

Let’s play in the REPL Python 2.7.10 (default, Jun 10 2015, 19:42:47) [GCC 4.2.1 Compatible Apple LLVM 6.1.0 (clang-602.0.53)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> with open('gettysburg.txt') as f: ... data = f.read() ... >>> import collections >>> words = collections.defaultdict(int) >>> for word in data.split(): ... word = word.strip('-,.').lower() ... words[word] += 1 ... >>> import operator >>> word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) >>> with open('results.txt', 'w') as f: ... for x in word_freqs[:10]: ... f.write(str(x) + '\n') ... print(x) ...

Slide 4

Slide 4 text

And the answer is... ('that', 13) ('the', 11) ('we', 10) ('to', 8) ('here', 8) ('a', 7) ('nation', 5) ('not', 5) ('for', 5) ('can', 5)

Slide 5

Slide 5 text

Convert to a script $ cat wordfreq.py with open('gettysburg.txt') as f: data = f.read() import collections words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 import operator word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) python wordfreq.py

Slide 6

Slide 6 text

#!/usr/bin/env python """ wordfreq.py - Count the frequency of words in text. """ import collections import operator # Read text with open('gettysburg.txt') as f: data = f.read() # Collect occurrences of each word words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 # Sort the word data from most frequent to least frequent. word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) # Output 10 most frequent words. with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) Clean up ./wordfreq.py python -c "import wordfreq"

Slide 7

Slide 7 text

... import collections import operator def main(): # Read text with open('gettysburg.txt') as f: data = f.read() # Collect occurrences of each word words = collections.defaultdict(int) for word in data.split(): word = word.strip('-,.').lower() words[word] += 1 # Sort the word data from most frequent to least frequent. word_freqs = sorted(words.items(), key=operator.itemgetter(1), reverse=True) # Output 10 most frequent words. with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) main() Add main function

Slide 8

Slide 8 text

... def main(): ... # Output 10 most frequent words. with open('results.txt', 'w') as f: for x in word_freqs[:10]: f.write(str(x) + '\n') print(x) if __name__ == '__main__': main() “Protect” main function ./wordfreq.py python -c "import wordfreq"

Slide 9

Slide 9 text

... def read_text(): """Read text""" ... def count_words(data): """Count occurrences of each word""" ... def sort_words(words): """Sort the word data from most frequent to least frequent.""" ... def output_results(word_freqs): """Output 10 most frequent words.""" ... def main(): """Main wordfreq function.""" data = read_text() words = count_words(data) word_freqs = sort_words(words) output_results(word_freqs) ... Split up logic

Slide 10

Slide 10 text

... def output_results(word_freqs, outstream): """Output 10 most frequent words.""" for x in word_freqs[:10]: outstream.write(str(x) + '\n') def main(): """Main wordfreq function.""" if len(sys.argv) < 2: data = sys.stdin.read() else: data = read_text(infile=sys.argv[1]) words = count_words(data) word_freqs = sort_words(words) if len(sys.argv) < 3: output_results(word_freqs, outstream=sys.stdout) else: with open(sys.argv[2], 'w') as outfile: output_results(word_freqs, outstream=outfile) ... Parameterize input and output files ./wordfreq.py gettysburg.txt results.txt cat alice.txt | ./wordfreq.py > alice_results.txt

Slide 11

Slide 11 text

import argparse ... def parse_args(): """Parse arguments from the command line.""" parser = argparse.ArgumentParser("Count the frequency of words in text.") parser.add_argument('infile', nargs="?", help="Source text. Default: STDIN") parser.add_argument('outfile', nargs="?", help="Result destination. Default: STDOUT") return parser.parse_args() def main(): """Main wordfreq function.""" args = parse_args() if not args.infile: data = sys.stdin.read() else: data = read_text(infile=args.infile) words = count_words(data) word_freqs = sort_words(words) if not args.outfile: output_results(word_freqs, outstream=sys.stdout) else: with open(args.outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile) ... Switch to using argparse ./wordfreq.py -h

Slide 12

Slide 12 text

... def output_results(word_freqs, outstream, count=10): """Output 10 most frequent words.""" for x in word_freqs[:count]: outstream.write(str(x) + '\n') def parse_args(): """Parse arguments from the command line.""" parser = argparse.ArgumentParser("Count the frequency of words in text.") parser.add_argument('-c', '--count', type=int, default=10, help="Number of words to report. Default: 10") parser.add_argument('infile', nargs="?", help="Source text. Default: STDIN") parser.add_argument('outfile', nargs="?", help="Result destination. Default: STDOUT") return parser.parse_args() def main(): """Main wordfreq function.""" ... if not args.outfile: output_results(word_freqs, outstream=sys.stdout, count=args.count) else: with open(args.outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile, count=args.count) ... Parameterize number of words ./wordfreq.py -c 20 gettysburg.txt

Slide 13

Slide 13 text

import os ... def parse_envvars(): """Get options from environment variables.""" return { 'count': os.environ.get('WF_COUNT'), 'infile': os.environ.get('WF_INFILE'), 'outfile': os.environ.get('WF_OUTFILE'), } ... Environment Variables (1/2) WF_COUNT=20 ./wordfreq.py gettysburg.txt

Slide 14

Slide 14 text

... def wordfreq(count=10, infile=None, outfile=None): if not infile: data = sys.stdin.read() else: data = read_text(infile=infile) words = count_words(data) word_freqs = sort_words(words) if not outfile: output_results(word_freqs, outstream=sys.stdout, count=count) else: with open(outfile, 'w') as outfile: output_results(word_freqs, outstream=outfile, count=count) ... Split functional code from options

Slide 15

Slide 15 text

... def main(): """Main wordfreq function.""" args = parse_args() env = parse_envvars() count = int(args.count or env['count']) or 10 infile = args.infile or env['infile'] or None outfile = args.outfile or env['outfile'] or None wordfreq(count, infile, outfile) ... Environment Variables (2/2)

Slide 16

Slide 16 text

import ConfigParser ... def parse_config(conffile): """Get options from a config file.""" config = ConfigParser.ConfigParser() config.read(conffile) options = {} if config.has_option('wordfreq', 'count'): options['count'] = config.getint('wordfreq', 'count') if config.has_option('wordfreq', 'infile'): options['infile'] = config.get('wordfreq', 'infile') if config.has_option('wordfreq', 'outfile'): options['outfile'] = config.get('wordfreq', 'outfile') return options def main(): ... conf = parse_config('wordfreq.conf') count = int(args.count or env['count'] or conf.get('count')) or 10 infile = args.infile or env['infile'] or conf.get('infile') or None outfile = args.outfile or env['outfile'] or conf.get('outfile') or None wordfreq(count, infile, outfile) ... Configuration File $ cat wordfreq.conf [wordfreq] count: 3 ./wordfreq.py gettysburg.txt

Slide 17

Slide 17 text

... def main(): ... try: wordfreq(count, infile, outfile) return 0 except: print >> sys.stderr, "An error occurred" return 1 if __name__ == '__main__': sys.exit(main()) Use return code ./wordfreq.py missing.txt

Slide 18

Slide 18 text

$ cat setup.py from setuptools import setup setup( name="wordfreq", author="Greg Back", version='0.0.1', py_modules=['wordfreq'], entry_points={ 'console_scripts': ['wordfreq = wordfreq:main'] } ) Add setup.py pip install wordfreq wordfreq -c 15 gettysburg.txt python -c "import wordfreq; wordfreq.wordfreq(infile='gettysburg.txt', count=25)"

Slide 19

Slide 19 text

Summary 1. Convert scripts to importable modules 2. Parameterize input/output. Permit STDIN/STDOUT. 3. Gather options from ○ arguments ○ environment variables ○ config files 4. Split core logic from option parsing 5. Use proper return values 6. Make module installable