main3.py
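"""Build a local mirror of the edwards.yale.edu "browse" listing.

Summary inferred from the code below: the script caches the browse page,
rewrites its volume links to point at local .html copies, writes an
index.html, then downloads each volume page and extracts its styles and
navigation lists into a 'zzz'-prefixed scratch file.
"""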
from bs4 import BeautifulSoup
import os
import requests
# Start fresh: remove any index.html left over from a previous run.
if os.path.isfile('index.html'):
    os.remove('index.html')

# Fetch the browse page once and cache it locally so later runs are offline.
if not os.path.isfile('cache'):
    page = requests.get('http://edwards.yale.edu/research/browse')
    soup = BeautifulSoup(page.text, "html.parser")
    with open("cache", "w") as myfile:
        myfile.write(str(soup))
else:
    with open('cache', 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, "html.parser")
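# Deleting the 'cache' file forces a fresh fetch on the next run.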
# Rewrite each link to point at a local .html copy, remembering the
# archive directory so absolute download URLs can be rebuilt later.
url = ''
for links in soup.findAll('a'):
    href = links.get('href')
    if href is None:
        continue  # skip anchors without an href
    filename = os.path.basename(href)
    dirname = os.path.dirname(href)
    if "archive" in dirname:
        url = dirname
    links['href'] = href.replace(filename, filename + '.html')
    links['href'] = links['href'].replace(dirname + '/', '')
# Copy the page styles and the volume list into a local index page.
result = soup.select('style, #center ul')
for items in result:
    with open("index.html", "a") as myfile:
        myfile.write(str(items))
downloadlist = {}
result2 = soup.select('#center ul')
for links in result2:
    raw = links.select('a')
    for items in raw:
        key = items.get('href').replace('.html', '')
        val = 'http://edwards.yale.edu' + url
        downloadlist[key] = val
print('There are ' + str(len(downloadlist)) + ' volumes')
count = 0
for item, val in downloadlist.items():
    downloadparent = val + "/" + item
    fileparent = item + ".html"
    # Download and process each volume page once; skip existing files so
    # the append-mode writes below are never duplicated across runs.
    if not os.path.isfile(fileparent):
        count = count + 1
        print(count)
        print(fileparent + " does not exist")
        page = requests.get(downloadparent)
        soup = BeautifulSoup(page.text, "html.parser")
        with open(fileparent, "w") as myfile:
            myfile.write(str(soup))
        with open(fileparent, 'r') as f:
            contents = f.read()
        soup = BeautifulSoup(contents, "html.parser")
        # Drop form inputs, then keep only the styles and navigation lists.
        for unwanted in soup("input"):
            unwanted.decompose()
        result = soup.select('style, .navlevel1, .navlevel2')
        style = soup.select('style')
        # Stage the extracted fragments in a 'zzz'-prefixed scratch file.
        for items in result:
            with open('zzz' + fileparent, "a") as myfile:
                myfile.write(str(items))
        with open('zzz' + fileparent, 'r') as f:
            contents = f.read()
        soup = BeautifulSoup(contents, "html.parser")
        print(soup)
        os.remove('zzz' + fileparent)
        # Point the navigation links at local .html copies, as on the index.
        for links in soup.findAll('a'):
            href = links.get('href')
            if href is None:
                continue
            filename = os.path.basename(href)
            dirname = os.path.dirname(href)
            url = dirname
            links['href'] = href.replace(filename, filename + '.html')
            links['href'] = links['href'].replace(dirname + '/', '')
        # Rebuild the scratch file: styles first, then the rewritten markup.
        for items in style:
            with open('zzz' + fileparent, "a") as myfile:
                myfile.write(str(items))
        for items in soup:
            with open('zzz' + fileparent, "a") as myfile:
                myfile.write(str(items))
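# A minimal sketch of a more defensive fetch helper; the name fetch_html
# and the timeout/raise_for_status handling are suggestions, not part of
# the original script. The loops above could call it in place of the bare
# requests.get(...) pattern.
def fetch_html(target_url):
    """Fetch a page and return parsed soup, failing loudly on HTTP errors."""
    response = requests.get(target_url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")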
"""
downloadlist = {}
result2 = soup
for links in result2:
raw = links.select('a')
for items in raw:
key = items.get('href')
val = 'http://edwards.yale.edu/archive' + url
key = key.replace('.html', '')
downloadlist[key] = val
for item,val in downloadlist.items():
downloadurl = (val + "/" + item)
filename = item + ".html"
"""