# CS 150 class example

"""
Extract data from a web page.

Scans the lines of a web page for a fixed prefix (e.g. ``mailto:``) and
collects the text that follows each occurrence, then writes the results to a
file with a header line.
"""

import urllib.request
import sys

# Text that introduces each address to scrape.
# Alternative: set PREFIX to "zoom.us/j/" to scrape Zoom meeting IDs instead.
PREFIX = "mailto:"


def _addresses_in_line(line, prefix):
    """Return every address that follows an occurrence of prefix in line.

    An address runs from just after the prefix up to (not including) the next
    double quote. If no closing quote follows, the rest of the line with
    trailing whitespace removed is used.
    """
    found = []
    search_from = 0
    while True:
        begin_index = line.find(prefix, search_from)
        if begin_index == -1:
            break
        begin_index += len(prefix)  # Advance to first character in address
        end_index = line.find('"', begin_index)  # Find ending quote
        if end_index == -1:
            # No closing quote: slicing with -1 would silently drop the last
            # character, so take the remainder of the line instead.
            found.append(line[begin_index:].rstrip())
            break
        found.append(line[begin_index:end_index])
        # Resume after the closing quote so later links on the same line
        # are picked up too.
        search_from = end_index + 1
    return found


def get_data(url, prefix=None):
    """
    Extract addresses from webpage

    Extract all addresses specified in a link, e.g.
    with prefix "mailto:", <a href="mailto:user@example.com"> yields
    "user@example.com"; with prefix "zoom.us/j/",
    <a href="https://zoom.us/j/123456789"> yields "123456789".
    Every occurrence on a line is collected, not just the first.

    Args:
        url: URL of page to scrape
        prefix: text that precedes each address; defaults to the module-level
            PREFIX when omitted

    Returns:
        List of scraped addresses, in the order they appear on the page
    """
    if prefix is None:
        prefix = PREFIX

    scraped = []
    with urllib.request.urlopen(url) as webpage:
        # Iterate through each line of webpage, just like a file
        for raw_line in webpage:
            # Obtain a string from the raw bytes, dropping undecodable bytes
            line = raw_line.decode("utf-8", "ignore")
            scraped.extend(_addresses_in_line(line, prefix))
    return scraped


def write_list_to_file(data, output_file, header):
    """
    Write out all data to a file

    The file is written as UTF-8 (matching how the page was decoded), with the
    header on the first line followed by one item per line.

    Args:
        data: list of strings
        output_file: path of output file
        header: string to write as header of output file
    """
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(header + "\n")
        out.writelines(item + "\n" for item in data)


def print_usage():
    """Print usage message for program"""
    print("python3 web_scraper.py <url> <output_file> <header>")


def main(argv):
    """Scrape the URL given on the command line and save the results.

    Args:
        argv: full argument vector; expects the program name followed by
            the URL, the output file path and the header string
    """
    if len(argv) != 4:
        print_usage()
        return

    url, outfile, header = argv[1:]
    data = get_data(url)
    write_list_to_file(data, outfile, header)
    print("Wrote:", outfile)


if __name__ == "__main__":
    main(sys.argv)