-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiprox_parse_urls.py
36 lines (27 loc) · 969 Bytes
/
iprox_parse_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import argparse
from bs4 import BeautifulSoup


def extract_urls(xml_data):
    """Return the raw-file URLs found in an iProX-style XML document.

    Parses *xml_data* (the XML document as a string) and collects the
    ``value`` attribute of every ``cvParam`` tag whose ``name`` is
    'Associated raw file URI', looking only inside ``DatasetFile``
    elements.

    Parameters:
        xml_data: str -- the full XML document text.

    Returns:
        list[str] -- the extracted URLs, in document order.
    """
    soup = BeautifulSoup(xml_data, 'xml')
    urls = []
    for dataset_file in soup.find_all('DatasetFile'):
        cv_param = dataset_file.find('cvParam', {'name': 'Associated raw file URI'})
        # Some DatasetFile entries may lack an associated raw-file URI; skip them.
        if cv_param:
            urls.append(cv_param['value'])
    return urls


def main():
    """Parse CLI arguments, print the extracted URLs, and write them to a file."""
    parser = argparse.ArgumentParser(description="Extract URLs from an XML file")
    parser.add_argument("xml_file", help="Path to the input XML file")
    # New optional flag; defaults to the previously hard-coded 'urls.txt',
    # so existing invocations behave identically.
    parser.add_argument(
        "-o", "--output", default="urls.txt",
        help="Path of the output file the URLs are written to (default: urls.txt)",
    )
    args = parser.parse_args()

    # Read the XML data from the input file.
    with open(args.xml_file, 'r', encoding='utf-8') as xml_file:
        xml_data = xml_file.read()

    urls = extract_urls(xml_data)

    # Echo the URLs to stdout, one per line (same output as before).
    for url in urls:
        print(url)

    # Write the URLs to the output file.
    with open(args.output, 'w', encoding='utf-8') as out:
        out.write('\n'.join(urls))


if __name__ == "__main__":
    main()