# CS 150 class example

"""
Extract data from a web page.
"""

import urllib.request
import sys

# Marker text that immediately precedes each address to extract from a line
# of the page; get_data() collects whatever follows it up to the next
# double quote.
# Alternative marker for collecting Zoom meeting IDs instead of e-mail
# addresses (swap with the active line below to use it):
#PREFIX = "zoom.us/j/"
PREFIX = "mailto:"

def get_data(url, prefix=None):
    """
    Extract addresses from webpage

    Extract all addresses specified in a link, e.g.
    with prefix "mailto:" finds <a href="mailto:joe@example.com">
    with prefix "zoom.us/j/" finds <a href = "https://middlebury.zoom.us/j/5225528930">

    An address is the text between an occurrence of the prefix and the next
    double quote on the same line. Every occurrence on a line is reported,
    not only the first one. An occurrence that has no closing double quote
    on its line is skipped rather than reported as a truncated address.

    Args:
        url: URL of page to scrape
        prefix: marker text that precedes each address; when omitted or
            None the module-level PREFIX is used

    Returns:
        List of scraped addresses, in the order they appear on the page
    """
    if prefix is None:
        prefix = PREFIX

    scraped = []

    with urllib.request.urlopen(url) as webpage:
        # Iterate through each line of webpage, just like a file
        for raw_line in webpage:
            # Obtain a string from the raw bytes; undecodable bytes are dropped
            line = raw_line.decode('utf-8', 'ignore')

            # Walk along the line so that several links on one line are all found
            search_from = 0
            while True:
                begin_index = line.find(prefix, search_from)
                if begin_index == -1:
                    break
                begin_index += len(prefix)  # Advance to first character in address
                end_index = line.find('"', begin_index)  # Find ending quote
                if end_index == -1:
                    # No closing quote after this occurrence, so none exists
                    # for any later occurrence on this line either. Slicing
                    # with -1 would silently chop the last character.
                    break
                scraped.append(line[begin_index:end_index])  # Extract address
                search_from = end_index + 1  # Resume after the closing quote

    return scraped
            

def write_list_to_file(data, output_file, header):
    """
    Write out all data to a file

    The file is created (or overwritten) with the header on the first line,
    followed by one item per line. Text is always encoded as UTF-8 so that
    non-ASCII items are written the same way on every platform instead of
    depending on the locale's default encoding.

    Args:
        data: list of strings
        output_file: path of output file
        header: string to write as header of output file
    """
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(header + "\n")
        # Emit every item on its own line in a single batched call
        file.writelines(item + "\n" for item in data)
    

def print_usage():
    """Show the expected command-line invocation on standard output"""
    usage = "python3 web_scraper.py <URL> <out_file> <header>"
    print(usage)

if __name__ == "__main__":
    # Exactly three arguments are expected after the script name:
    # the page URL, the output file path, and the header line.
    if len(sys.argv) != 4:
        print_usage()
        # Exit with a non-zero status so shells and calling scripts can
        # tell that nothing was scraped (2 is the usual "bad usage" code).
        sys.exit(2)

    url, outfile, header = sys.argv[1:]
    data = get_data(url)
    write_list_to_file(data, outfile, header)
    print("Wrote:", outfile)