main4.py
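# Mirror the edwards.yale.edu research "browse" listing as flat local
# .html pages, with links rewritten so the copy can be browsed offline.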
from bs4 import BeautifulSoup
import os
import requests
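# Start fresh: drop any index.html from a previous run, then fetch the
# browse page, caching its raw HTML locally so repeat runs skip the network.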
if os.path.isfile('index.html'):
    os.remove('index.html')
if not os.path.isfile('cache'):
    page = requests.get('http://edwards.yale.edu/research/browse')
    soup = BeautifulSoup(page.text, "html.parser")
    with open("cache", "w") as myfile:
        myfile.write(str(soup))
else:
    with open('cache', 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, "html.parser")
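# Rewrite every archive link to point at a flat local file
# (".../archive/foo" -> "foo.html") and remember the archive directory.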
url = ''
for links in soup.findAll('a'):
    href = links.get('href')
    if not href:
        continue
    filename = os.path.basename(href)
    dirname = os.path.dirname(href)
    if "archive" in dirname:
        url = dirname
        links['href'] = links['href'].replace(filename, filename + '.html')
        links['href'] = links['href'].replace(dirname + '/', '')
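# Save the page's <style> block plus the rewritten link lists as index.html.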
result = soup.select('style, #center ul')
with open("index.html", "a") as myfile:
    for items in result:
        myfile.write(str(items))
downloadlist = {}
processed = 0
result2 = soup.select('#center ul')
for links in result2:
    raw = links.select('a')
    processed += len(raw)
    for items in raw:
        nameoffile = items.get('href').replace('.html', '')
        urloffile = 'http://edwards.yale.edu' + url
        downloadlist[nameoffile] = urloffile
print(str(processed) + ' links were processed')
print(str(len(downloadlist)) + ' links added to downloadlist dictionary')
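# Fetch each entry page, skipping files that are already on disk.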
for nameoffile, urloffile in downloadlist.items():
    fulllinkurl = urloffile + "/" + nameoffile
    fullfilename = nameoffile + ".html"
    if not os.path.isfile(fullfilename):
        page = requests.get(fulllinkurl)
        soup = BeautifulSoup(page.text, "html.parser")
        with open(fullfilename, "w") as myfile:
            myfile.write(str(soup))
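# Second pass: reduce each downloaded page to its navigation lists,
# rewrite those links for local browsing, and prepend the shared
# stylesheet read from the local 'stylefile'.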
for nameoffile, urloffile in downloadlist.items():
    fullfilename = nameoffile + ".html"
    with open(fullfilename, 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, "html.parser")
    for unwanted in soup("input"):
        unwanted.decompose()
    result = soup.select('.navlevel1, .navlevel2')
    if not os.path.isfile(fullfilename + '.tmp'):
        with open(fullfilename + '.tmp', "a") as myfile:
            for items in result:
                myfile.write(str(items))
    with open(fullfilename + '.tmp', 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, "html.parser")
    for links in soup.findAll('a'):
        href = links.get('href')
        if not href:
            continue
        filename = os.path.basename(href)
        dirname = os.path.dirname(href)
        url = dirname
        links['href'] = links['href'].replace(filename, filename + '.html')
        links['href'] = links['href'].replace(dirname + '/', '')
    with open('stylefile', 'r') as f:
        contents = f.read()
    print('arrived')
    os.remove(fullfilename)
    if os.path.isfile(fullfilename + '.tmp'):
        os.remove(fullfilename + '.tmp')
    with open(fullfilename, "w") as myfile:
        myfile.write(contents)
        for items in soup:
            myfile.write(str(items))
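# Disabled draft of a deeper follow-up crawl, kept as an inert string.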
"""
downloadlist = {}
result2 = soup
for links in result2:
raw = links.select('a')
for items in raw:
key = items.get('href')
val = 'http://edwards.yale.edu/archive' + url
key = key.replace('.html', '')
downloadlist[key] = val
for item,val in downloadlist.items():
downloadurl = (val + "/" + item)
filename = item + ".html"
"""