forked from Ahmed-Sabri/crawlee-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathappz.py
More file actions
40 lines (30 loc) · 1.34 KB
/
appz.py
File metadata and controls
40 lines (30 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import asyncio
import hashlib
import os
import re

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
# Saved pages are written to an 'output-folder' directory located under the
# current working directory. It is created when this module is loaded;
# exist_ok=True makes that a no-op if the directory is already there.
output_dir = os.path.join(
    os.getcwd(),
    'output-folder',
)
os.makedirs(name=output_dir, exist_ok=True)
# Characters outside this conservative allow-list are replaced in file names:
# several of them ('?', ':', '*', '|', '"', '<', '>') are rejected by Windows,
# and query strings routinely contain them.
_UNSAFE_FILENAME_CHARS = re.compile(r'[^A-Za-z0-9._~%@+=,-]')

# Upper bound on the sanitized stem. Together with the digest suffix and the
# '.html' extension the result stays below the 255-byte file-name limit that
# common filesystems impose.
_MAX_FILENAME_STEM = 200


def _url_to_filename(url: str) -> str:
    """Derive a portable ``.html`` file name from *url*.

    The ``https://`` prefix is dropped and every ``/`` becomes ``_``. When
    that result already consists solely of safe characters and is short
    enough, it is used unchanged. Otherwise the unsafe characters are
    replaced with ``_``, the stem is truncated, and a short SHA-256 digest of
    the full URL is appended so that distinct URLs which sanitize to the same
    stem still get distinct file names.

    Args:
        url: The URL of the page being saved.

    Returns:
        A file name (no directory component) ending in ``.html``.
    """
    stem = url.replace('https://', '').replace('/', '_')
    safe_stem = _UNSAFE_FILENAME_CHARS.sub('_', stem)[:_MAX_FILENAME_STEM]
    if safe_stem != stem:
        # Sanitizing or truncating lost information; the digest restores
        # uniqueness for the rewritten name.
        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()[:12]
        safe_stem = f'{safe_stem}-{digest}'
    return f'{safe_stem}.html'


async def main() -> None:
    """Crawl starting at https://example.com/ and save each page as HTML.

    Creates a ``PlaywrightCrawler`` limited to 500 requests per crawl and
    registers a default request handler that writes the page's HTML into the
    module-level ``output_dir`` and enqueues the links found on the page.
    """
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=500,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Save the current page's HTML to disk, then enqueue its links."""
        context.log.info(f'Processing {context.request.url}')

        # Extract the content of the page.
        content = await context.page.content()

        # Build a file name that is valid on every platform, then save the page.
        filepath = os.path.join(output_dir, _url_to_filename(context.request.url))
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(content)
        context.log.info(f'Saved {context.request.url} to {filepath}')

        # Enqueue the links matched by the 'a' selector; adjust the selector
        # to narrow which links are followed.
        # NOTE(review): no handler is registered for the 'DETAIL' label in
        # this file -- confirm these requests are routed as intended.
        await context.enqueue_links(
            selector='a',
            label='DETAIL',
        )

    await crawler.run(['https://example.com/'])
# Script entry point: when the file is executed directly (not imported),
# drive the main() coroutine to completion with asyncio.run.
if __name__ == '__main__':
    entry_coroutine = main()
    asyncio.run(entry_coroutine)