-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUsing_Regular_Expression_and_Extracting_data_from_large_files.py
134 lines (113 loc) · 3.63 KB
/
Using_Regular_Expression_and_Extracting_data_from_large_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Using regular expressions
# 1. Extracting the IP address
# Regular expressions are used for matching patterns in text.
#   \d{1,3}     : one to three digits (one dotted component of the address)
#   \.          : a literal dot
#   (?:...){3}  : non-capturing group repeated three times (".ddd" x 3)
#   \b          : word boundary, so a match cannot begin or end in the
#                 middle of a longer run of digits
#   (?P<IP>...) : named group; the text matched inside the parentheses is
#                 accessible by the name 'IP'
# NOTE: the pattern checks the dotted-quad *shape* only; it does not verify
# that each component is in the range 0-255.
import re

line = '127.0.0.1 - rj [13/Nov/2019:14:43:30] "GET HTTP/1.0" 200'
ip_pattern = r'\b(?P<IP>\d{1,3}(?:\.\d{1,3}){3})\b'
match = re.search(ip_pattern, line)  # first occurrence anywhere in the line
if match:
    print(match.group('IP'))
# O/P: 127.0.0.1
# 2. Extracting the time
#   \[ ... \]          : literal opening / closing square brackets
#   \d{2}              : exactly two digits (day of the month)
#   [A-Za-z]{3}        : three letters (month abbreviation such as Nov)
#   \d{4}              : four digits (year)
#   :\d{2}:\d{2}:\d{2} : hours:minutes:seconds
#   (?: [-+]\d{4})?    : optional UTC offset such as " -0800", so the same
#                        pattern also accepts timestamps that carry one
#   (?P<Time>...)      : named group; the text inside the parentheses is
#                        accessible by the name 'Time'
import re

line = '127.0.0.1 - rj [13/Nov/2019:14:43:30] "GET HTTP/1.0" 200'
time_pattern = (
    r'\[(?P<Time>\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2}'
    r'(?: [-+]\d{4})?)\]'
)
match = re.search(time_pattern, line)
if match:
    print(match.group('Time'))
# O/P: 13/Nov/2019:14:43:30
# 3. Extracting multiple elements (IP, User, Time, and Request)
# The pieces are adjacent raw-string literals, which Python joins into one
# pattern; each named group can then be read from the same match object.
import re

line = '127.0.0.1 - rj [13/Nov/2019:14:43:30] "GET HTTP/1.0" 200'
full_pattern = (
    r'\b(?P<IP>\d{1,3}(?:\.\d{1,3}){3})'  # IP address (dotted-quad shape)
    r' - (?P<User>\S+)'                   # user; \S+ also accepts '-' or 'a.b'
    r' \[(?P<Time>\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2}'
    r'(?: [-+]\d{4})?)\]'                 # time, with optional UTC offset
    # Lazy .+? stops at the first closing quote, so later quoted fields on
    # the same line are not swallowed into the request.
    r' (?P<Request>".+?")'
)
match = re.search(full_pattern, line)
if match:
    print(match.group('IP'))
    print(match.group('Time'))
    print(match.group('User'))
    print(match.group('Request'))
# O/P:
# 127.0.0.1
# 13/Nov/2019:14:43:30
# rj
# "GET HTTP/1.0"
# 4. Extracting the IP address (plus user, time and request) of every GET
#    request made on a specific date
import re

log_data = """
127.0.0.1 - swills [08/Nov/2019:14:43:30 -0800] "GET /assets/234 HTTP/1.0" 200 2326
192.168.1.1 - john [08/Nov/2019:15:12:34 -0800] "GET /index.html HTTP/1.1" 200 3421
10.0.0.1 - jane [08/Nov/2019:16:22:10 -0800] "POST /login HTTP/1.1" 200 532
342.3.2.33 - doe [08/Nov/2019:17:45:50 -0800] "GET /home HTTP/1.0" 404 231
"""
# Date to filter on, written the way it appears inside the log's [...] field.
target_date = '08/Nov/2019'
get_request_pattern = (
    r'\b(?P<IP>\d{1,3}(?:\.\d{1,3}){3})'  # IP address (dotted-quad shape)
    + r' - (?P<User>\S+)'                 # user; \S+ also accepts '-'
    # re.escape keeps the date literal even if it contains regex
    # metacharacters; only entries stamped with that date can match.
    + r' \[(?P<Time>' + re.escape(target_date)
    + r':\d{2}:\d{2}:\d{2} [-+]\d{4})\] '
    + r'"(?P<Request>GET .+?)"'           # request line must start with GET
)
# finditer yields one match object per matching log entry, lazily.
matches = re.finditer(get_request_pattern, log_data)
for match in matches:
    print(match.group('IP'))
    print(match.group("User"))
    print(match.group('Time'))
    print(match.group('Request'))
# O/P:
# 127.0.0.1
# swills
# 08/Nov/2019:14:43:30 -0800
# GET /assets/234 HTTP/1.0
# 192.168.1.1
# john
# 08/Nov/2019:15:12:34 -0800
# GET /index.html HTTP/1.1
# 342.3.2.33
# doe
# 08/Nov/2019:17:45:50 -0800
# GET /home HTTP/1.0
# Dealing with large files
# Processing a large text file line by line: iterating over the file object
# yields one line at a time, so the whole file is never held in memory.
source_file = r'/home/gauravmtwt1/Downloads/log.txt'
# One `with` statement manages both handles; each is closed on exit, even if
# reading or writing raises.
with open(source_file, 'r') as source_doc, \
        open('source_file_corrected.txt', 'w') as target_file:
    for line in source_doc:
        # Any per-line processing would go here; currently a straight copy.
        target_file.write(line)
# Processing a large binary file in chunks (useful for images/videos)
# 'ab' opens the target for appending in binary mode, so each call adds its
# chunk to the end of the file instead of overwriting what is already there.
# Sample process-data function that writes one chunk to the output file.
def process_data(chunk: bytes,
                 target_path: str = 'processed_image.png') -> None:
    """Append one chunk of bytes to the file at ``target_path``.

    Args:
        chunk: The bytes to write.
        target_path: File to append to; it is created if it does not exist.
            Defaults to ``'processed_image.png'``.

    Raises:
        OSError: If the target file cannot be opened or written.
    """
    with open(target_path, 'ab') as target_file:
        target_file.write(chunk)
# Reading the image file in fixed-size chunks and processing each chunk
CHUNK_SIZE = 1024  # number of bytes requested per read() call
with open('Processs_image_using_python.png', 'rb') as image_file:
    # process_data() appends to 'processed_image.png', so empty that file
    # first; otherwise running the script again would tack a second copy of
    # the image onto the end of the previous output.
    open('processed_image.png', 'wb').close()
    # iter(callable, sentinel) keeps calling read() until it returns b'',
    # which is what a binary file yields at end-of-file.
    for chunk in iter(lambda: image_file.read(CHUNK_SIZE), b''):
        process_data(chunk)