ipo.py
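"""Scrape the Chittorgarh IPO list (mainboard and SME), fetch each IPO's
subscription figures, filter by QIB subscription, and export the result to Excel."""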
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import Workbook  # openpyxl is the engine pandas uses to write .xlsx files

# Global vars
use_count = False


def main():
    # Example usage
    url = 'https://www.chittorgarh.com/report/ipo-in-india-list-main-board-sme/82/'  # IPO listing page to scrape
    output_file = 'ipo_data.xlsx'  # Output Excel file path
    parse_webpage(url, output_file)

key_mapping = {
    'Others': 'qib',
    'Qualified Institutional': 'qib',
    'Qualified Institutions': 'qib',
    'Non Institutional': 'hni',
    'Non-Institutional Buyers*': 'hni',
    'Non-Institutional Buyers': 'hni',
    'Retail Investors': 'rii',
    'Retail Individual': 'rii',
    'Total Subscription': 'total',
    'Total': 'total',
}

def get_subscription_details(url):
    # Send a GET request to the subscription page URL
    response = requests.get(url)

    # Create a BeautifulSoup object to parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the specific element containing the subscription data
    subscription_details = soup.find("div", itemtype='https://schema.org/Table')
    if subscription_details is None:
        print("Subscription details not found on the webpage.")
        return None

    subscription_table = subscription_details.find("table")
    if subscription_table is None:
        print("Subscription table not found on the webpage.")
        return None

    # Extract the subscription details from the table
    data = {}
    headers = [header.text.strip() for header in subscription_table.select('thead tr th')]
    for row in subscription_table.select('tbody tr'):
        row_data = [cell.text.strip() for cell in row.select('td')]
        if len(row_data) == len(headers):
            data[row_data[0]] = row_data[1]

    # Map the irregular IPO category names to the canonical keys (qib, hni, rii, total).
    # Note: 'Others' maps to qib (it is classified under the QIB quota only).
    mapped_data = {key_mapping.get(key, key): value for key, value in sorted(data.items())}

    collected_data = {}
    selected_keys = ['qib', 'hni', 'rii', 'total']  # Select only these fields
    for key in selected_keys:
        if key in mapped_data:
            if mapped_data[key] == "[.]":
                collected_data[key] = 0
            else:
                collected_data[key] = float(mapped_data[key])
        else:
            collected_data[key] = 0
    return collected_data

def parse_webpage(url, output_file):
    # Send a GET request to the webpage
    response = requests.get(url)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Find the table of interest based on the id attribute
    table_div = soup.find('div', id='report_data')

    # Extract the table from the div
    table = table_div.find('table')

    # Extract the table headers
    headers = [header.text.strip() for header in table.select('thead tr th')]
    headers.insert(0, 'URL')  # Insert the new header 'URL' at the beginning
    headers.extend(["qib", "hni", "rii", "total"])

    # Extract the table rows
    rows = []
    count = 0
    failed = []
    for row in table.select('tbody tr'):
        # Find the link element in the first column
        link = row.find('a')

        # Extract the URL and point it at the IPO's subscription page
        href = link['href']
        href = href.replace("/ipo/", "/ipo_subscription/")
        row_data = [href] + [cell.text.strip() for cell in row.select('td')]
        try:
            # Call get_subscription_details to fetch the subscription figures
            stock_details = get_subscription_details(href)
            if len(stock_details) == 4:
                row_data.extend(stock_details.values())
                rows.append(row_data)
        except Exception as e:
            failed.append(row_data)
            print(f"Failed for {row_data}: {e}")
        count += 1
        if count == 15 and use_count:
            break

    # Write the rows whose fetch failed to a text file
    with open("failed.txt", "w") as fh:
        for item in failed:
            fh.write(str(item) + "\n")

    # Convert the data to a pandas DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Drop the specified columns from the DataFrame
    df = df.drop(columns=["Open Date", "Lot Size", "Compare"])

    # Column order for the exported data
    desired_column_order = [
        "Close Date",
        "Issuer Company",
        "Issue Size (Rs Cr.)",
        "Issue Price (Rs)",
        "qib",
        "hni",
        "rii",
        "total",
        "URL",
        "Exchange",
    ]

    # Keep only rows where 'qib' >= 25
    filtered_df = df[df['qib'] >= 25]

    # Additionally require 'qib' >= 50 when 'Exchange' is "NSE SME" or "BSE SME"
    filtered_df = filtered_df[filtered_df.apply(
        lambda row: row['qib'] >= 50 if row['Exchange'] in ["NSE SME", "BSE SME"] else True,
        axis=1,
    )]

    # Reorder the columns in the filtered DataFrame
    filtered_df = filtered_df[desired_column_order]

    # Export the filtered DataFrame to an Excel file
    filtered_df.to_excel(output_file, index=False)

if __name__ == "__main__":
    main()