# CS 150 class example
"""
Extract data from a web page.

Run as a script, this downloads SOURCE_PAGE, collects every address that
follows PREFIX inside a double-quoted link, and writes the results to
"scraped.txt". Importing the module only defines the helpers; it performs no
network or file I/O.
"""
import urllib.request
import sys

SOURCE_PAGE = "http://www.cs.middlebury.edu/~cm2/tutors/pages/cs101_150_tutoring.html"
PREFIX = "zoom.us/j/"


def _extract_addresses(line, prefix):
    """Return every address in `line` that follows `prefix` and ends at a '"'."""
    found = []
    position = 0
    while True:
        begin_index = line.find(prefix, position)
        if begin_index == -1:
            break
        begin_index += len(prefix)  # Advance to first character in address
        end_index = line.find('"', begin_index)  # Find ending quote
        if end_index == -1:
            # No closing quote: the address cannot be delimited reliably, and
            # slicing with -1 would silently drop the line's last character.
            break
        found.append(line[begin_index:end_index])
        # Resume after the closing quote so the scan always makes progress,
        # even for an empty prefix.
        position = end_index + 1
    return found


def get_data(url, prefix=None):
    """
    Extract addresses from a webpage

    Extract all addresses specified in a double-quoted link, e.g.
    with prefix "mailto:" the text <a href="mailto:user@example.com">
    yields "user@example.com", and with prefix "zoom.us/j/" the text
    <a href="https://zoom.us/j/123456"> yields "123456".

    Every occurrence on a line is collected, not just the first. An
    occurrence with no closing double quote later on the same line is
    skipped.

    Args:
        url: URL of page to scrape
        prefix: text that immediately precedes each address; defaults to
            the module-level PREFIX when None

    Returns:
        List of scraped addresses, in the order they appear on the page
    """
    if prefix is None:
        prefix = PREFIX

    scraped = []
    with urllib.request.urlopen(url) as webpage:
        # Iterate through each line of webpage, just like a file
        for raw_line in webpage:
            # Obtain a string from the raw bytes; undecodable bytes are dropped
            line = raw_line.decode('utf-8', 'ignore')
            scraped.extend(_extract_addresses(line, prefix))
    return scraped


def write_list_to_file(data, output_file, header):
    """
    Write out all data to a file

    The file is created or overwritten. It is written as UTF-8 so the
    result does not depend on the platform's default encoding.

    Args:
        data: list of strings, written one per line after the header
        output_file: path of output file
        header: string to write as header (first line) of output file
    """
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(header + "\n")
        for item in data:
            out.write(item + "\n")


if __name__ == "__main__":
    # Guarded so that importing this module does not hit the network.
    outfile = "scraped.txt"
    data = get_data(SOURCE_PAGE)
    header = "Scraped data"
    write_list_to_file(data, outfile, header)