# CS 150 class example
"""
Extract data from a web page.
"""
import urllib.request
import sys
# Marker text that get_data() searches for in each line of the page; the
# characters between this marker and the next double quote are scraped.
# Alternative marker for scraping Zoom meeting links instead of e-mail links:
#PREFIX = "zoom.us/j/"
PREFIX = "mailto:"
def get_data(url, prefix=None):
    """
    Extract addresses from a webpage.

    Scans the page line by line and collects the text that follows every
    occurrence of the prefix, up to (but not including) the next double
    quote, e.g.
        with prefix "mailto:" the line
            <a href="mailto:someone@example.com">
        yields "someone@example.com"
        with prefix "zoom.us/j/" the line
            <a href="https://zoom.us/j/123456789">
        yields "123456789"

    Args:
        url: URL of page to scrape
        prefix: text marking the start of an address; when omitted the
            module-level PREFIX is used

    Returns:
        List of scraped addresses, in the order they appear on the page

    Raises:
        Any exception raised by urllib.request.urlopen (for example
        urllib.error.URLError) propagates to the caller.
    """
    if prefix is None:
        prefix = PREFIX

    scraped = []
    with urllib.request.urlopen(url) as webpage:
        # Iterate through each line of webpage, just like a file
        for raw_line in webpage:
            # Obtain a string from the raw bytes, dropping undecodable bytes
            line = raw_line.decode('utf-8', 'ignore')

            # A single line may hold several links, so keep searching from
            # where the previous match ended instead of stopping at the first.
            search_from = 0
            while True:
                begin_index = line.find(prefix, search_from)
                if begin_index == -1:
                    break
                begin_index += len(prefix)  # Advance to first character in address
                end_index = line.find('"', begin_index)  # Find ending quote
                if end_index == -1:
                    # No closing quote: take the rest of the line without its
                    # line ending. Slicing with -1 here would silently chop
                    # the last character of the address.
                    scraped.append(line[begin_index:].rstrip('\r\n'))
                    break
                scraped.append(line[begin_index:end_index])  # Extract address
                search_from = end_index + 1
    return scraped
def write_list_to_file(data, output_file, header):
    """
    Write out all data to a file, one item per line, below a header line.

    Any existing file at output_file is overwritten.

    Args:
        data: list of strings
        output_file: path of output file
        header: string to write as header of output file
    """
    # Pin the encoding so the output does not depend on the platform locale;
    # without it, non-ASCII items can fail to encode on some systems.
    with open(output_file, "w", encoding="utf-8") as out:
        out.write(header + "\n")
        out.writelines(item + "\n" for item in data)
def print_usage():
    """
    Print usage message for program.

    Names the three command-line arguments the script expects after the
    script name: the page URL, the output file path, and the header line.
    """
    print("python3 web_scraper.py <url> <output_file> <header>")
if __name__ == "__main__":
    # The script needs exactly three arguments after its own name:
    # the page URL, the output file path and the header line.
    if len(sys.argv) == 4:
        url, outfile, header = sys.argv[1:]
        data = get_data(url)
        write_list_to_file(data, outfile, header)
        print("Wrote:", outfile)
    else:
        # Wrong argument count: show how to invoke the script instead
        print_usage()