Python for Everybody

Chapter 12

Exercise 12.3

"""
Exercise  12.3: Use urllib to replicate the previous exercise of (1) retrieving
the document from a URL, (2) displaying up to 3000 characters, and (3) counting
the overall characters in the document. Don't worry about the headers for this
exercise, simply show the first 3000 characters of the document contents.

Python for Everybody: Exploring Data Using Python 3
by Charles R. Severance

Solution by Jamison Lahman, June 4, 2017
"""
import urllib.request
import urllib.parse
import urllib.error


# Upper bound on how many characters of the document are echoed.
MAX_DISPLAY = 3000

# The context manager closes the HTTP response even if reading or decoding
# raises.
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as fhand:
    # Decode once, at the I/O boundary. Newlines are kept, so \n is counted
    # as a character.
    text = fhand.read().decode()

characters = len(text)

# Slice by character rather than by line: a per-line check against the running
# total would drop the whole line that crosses the limit and show fewer than
# MAX_DISPLAY characters.
print(text[:MAX_DISPLAY])
print(characters)

Exercise 12.4

"""
Exercise  12.4: Change the urllinks.py program to extract and count paragraph
(p) tags from the retrieved HTML document and display the count of the
paragraphs as the output of your program. Do not display the paragraph text,
only count them. Test your program on several small pages as well as some
larger web pages.

Python for Everybody: Exploring Data Using Python 3
by Charles R. Severance

Solution by Jamison Lahman, June 4, 2017
"""
import urllib.request
import urllib.parse
import urllib.error
import ssl
from bs4 import BeautifulSoup


# Ignore SSL certificate errors.
# NOTE(review): this disables hostname and certificate verification for the
# request below. It is kept because the exercise template does it; do not
# reuse this context for anything that handles sensitive data.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the paragraph (p) tags; only their number is reported,
# never their text.
tags = soup.find_all('p')
count = len(tags)
print(count)

Exercise 12.5

"""
Exercise  12.5: (Advanced) Change the socket program so that it only shows
data after the headers and a blank line have been received. Remember that
recv is receiving characters (newlines and all), not lines.

Python for Everybody: Exploring Data Using Python 3
by Charles R. Severance

Solution by Jamison Lahman, June 5, 2017
"""
import socket
import re

# Blank line (CRLF CRLF) that separates the HTTP headers from the body.
HEADER_END = b'\r\n\r\n'

# The context manager closes the socket even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as my_sock:
    my_sock.connect(('data.pr4e.org', 80))
    # HTTP request lines are terminated by CRLF, and an empty line ends the
    # request.
    cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
    # sendall() keeps sending until the whole request has gone out; send()
    # may transmit only part of the buffer.
    my_sock.sendall(cmd)

    # recv() returns arbitrary byte chunks, not lines: the header may span
    # several chunks and the separator itself may be split across two of them.
    # Collect raw bytes until the server closes the connection.
    chunks = []
    while True:
        data = my_sock.recv(512)
        if not data:
            break
        chunks.append(data)

response = b''.join(chunks)

# partition() yields an empty body when the separator is missing, so header
# bytes are never printed by mistake.
_header, _separator, body = response.partition(HEADER_END)

# Decode once over the complete body so a multi-byte character that straddles
# a 512-byte chunk boundary is decoded correctly, and print without inserting
# extra newlines between chunks.
print(body.decode(), end='')