-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStage3.py
58 lines (43 loc) · 1.7 KB
/
Stage3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import csv
import os
import time
import webbrowser
from pathlib import Path

import requests
from bs4 import BeautifulSoup
# Build a mapping from the first column of Stage2.csv to its second column.
# The loop below treats the keys as URLs; the values are presumably output
# file names (the original loop variable was called `filename`) -- TODO confirm.
#
# newline='' is what the csv module asks for when opening files, so that
# line breaks inside quoted fields are parsed correctly.
with open('Stage2.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # csv.reader yields [] for blank lines and a 1-item list for rows with a
    # single column; indexing rows[1] on those would raise IndexError, so
    # only rows that really have both columns are kept.
    mydict = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}

# Not populated anywhere in this script yet; kept so the name stays defined.
secondphaselinksandfilenames = {}

# Only the keys are needed here, so iterate the dict directly.
for url in mydict:
    print(url)
#page = (requests.get('http://edwards.yale.edu/archive?path=aHR0cDovL2Vkd2FyZHMueWFsZS5lZHUvY2dpLWJpbi9uZXdwaGlsby9nZXRvYmplY3QucGw/Yy4xODo3LndqZW8='))
#soup = BeautifulSoup(page.text, "html.parser")
#soup = soup.select ('#content #text')
#if os.path.isfile('test3') is True: os.remove('test3')
#for items in soup:
# with open('test3', "a") as myfile:
# myfile.write(str(items))
# Parse the locally saved HTML held in 'test3'.
with open('test3', 'r') as f:
    contents = f.read()
soup = BeautifulSoup(contents, "html.parser")

# Dump the untouched parse to test4.html, starting from a clean file.
# The file is only created when the document has at least one top-level
# node, exactly as with the per-node append it replaces.
if os.path.isfile('test4.html'):
    os.remove('test4.html')
top_level_nodes = [str(node) for node in soup]
if top_level_nodes:
    with open('test4.html', "a") as myfile:
        myfile.writelines(top_level_nodes)

# Strip <center>, <a> and <hr> elements from the tree, one tag name at a
# time and in this order, re-querying the tree before each pass.
for tag_name in ("center", "a", "hr"):
    for unwanted in soup(tag_name):
        unwanted.decompose()
# Write the final test4.html: the contents of 'stylefinal' followed by the
# serialised document with every '</p><p></p><p>' sequence removed.

# From here on `soup` is the document as a plain string, not a parse tree.
soup = str(soup)
with open('stylefinal', 'r') as f:
    style = f.read()

# str.replace leaves a line unchanged when the marker is absent, so a single
# unconditional replace covers both cases. The previous pair of `if` tests
# checked two different substrings ('</p><p></p>' and '</p><p></p><p>'), so a
# line containing only the shorter one was written to the file twice.
cleaned_lines = [
    line.replace('</p><p></p><p>', '') for line in soup.splitlines()
]

# Mode "w" truncates any existing file, so no separate delete is needed, and
# the file is opened once rather than once per line.
# Line breaks are not restored: splitlines() strips them and the lines are
# written back to back, as before.
with open('test4.html', "w") as myfile:
    myfile.write(style)
    myfile.writelines(cleaned_lines)
# Show the generated page in the browser (new=2 asks for a new tab).
# The URL is derived from the same relative path the script wrote to, so it
# always points at the file just produced instead of one fixed home
# directory. Run from /Users/williamcorney/Edwards it yields the same URL
# as the previous hard-coded string.
webbrowser.open(Path('test4.html').resolve().as_uri(), new=2)