import asyncio
import random
import sqlite3
import time
from datetime import datetime as dt

import aiofiles
import aiohttp
import aiosqlite
from user_agent import generate_user_agent
def create_table():
    with sqlite3.connect('comments.db') as conn:
        c = conn.cursor()
        c.execute('''
            CREATE TABLE IF NOT EXISTS comments (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                comment_id INTEGER,
                text TEXT,
                author TEXT,
                published_at TEXT,
                summ INTEGER,
                replyTo INTEGER
            )
        ''')
        c.execute('''
            CREATE TABLE IF NOT EXISTS posts (
                id INTEGER PRIMARY KEY,
                title TEXT,
                text TEXT,
                author TEXT,
                published_at TEXT,
                comments INTEGER,
                favorites INTEGER,
                reposts INTEGER,
                views INTEGER,
                hits INTEGER,
                reads_count INTEGER,
                likes INTEGER,
                subsite_name TEXT,
                author_is_plus INTEGER
            )
        ''')
URL = 'https://api.dtf.ru/v2.31/content'
COMMENTS_URL = 'https://api.dtf.ru/v2.31/comments'
TIMEOUT_SEC = 60  # back-off after a 429 response
count_workers = 10
START_ID = 2850162
END_ID = 2975982
LIST_ID = list(range(START_ID, END_ID))

# PROXY (placeholder credentials -- substitute your own)
PROTOCOL = 'http'
USERNAME = 'user'
PASSWORD = '1234'
HOST = '45.45.45.45:7276'
PROXY_URL = f'{PROTOCOL}://{USERNAME}:{PASSWORD}@{HOST}'
async def insert_data(table, data):
    """Insert a post tuple or a list of comment dicts into the database."""
    async with aiosqlite.connect('comments.db') as db:
        try:
            if table == "comments":
                await db.executemany(
                    'INSERT INTO comments (post_id, comment_id, text, author, published_at, summ, replyTo) '
                    'VALUES (:post_id, :comment_id, :text, :author, :published_at, :summ, :replyTo)',
                    data)
            else:
                await db.execute(
                    'INSERT INTO posts VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    data)
            await db.commit()
        except aiosqlite.IntegrityError:
            # data is a tuple for posts (id comes first) and a list of dicts for comments
            record_id = data[0] if table == "posts" else data[0]["comment_id"]
            print(f"A record with id={record_id} already exists in the database.")
def extract_posts_from_json(json_data):
    """Flatten the API response for a single post into a dict matching the posts table columns."""
    post_data = json_data["result"]
    counters = post_data["counters"]
    post_text = '\n'.join(
        block["data"]["text"] for block in post_data.get("blocks", []) if block.get("type") == "text")
    return {
        "id": post_data["id"],
        "title": post_data.get("title"),
        "text": post_text,
        "author": post_data["author"]["name"],
        "published_at": post_data["date"],
        "comments": counters["comments"],
        "favorites": counters["favorites"],
        "reposts": counters["reposts"],
        "views": counters["views"],
        "hits": counters["hits"],
        "reads_count": counters["reads"],
        "likes": post_data["likes"]["counter"],
        "subsite_name": post_data["subsite"].get("name"),
        "author_is_plus": post_data["author"].get("isPlus"),
    }
async def dudos_pdf(current_id):
    """Fetch one post and its comments, retrying on 429, and log the outcome."""
    async with aiofiles.open(file_log, mode='a+') as f:
        log = ''
        try:
            async with aiohttp.ClientSession() as session:
                query = {"id": current_id}
                status = False
                while not status:
                    user_agent = {'user-agent': generate_user_agent()}
                    response = await session.get(URL, params=query, headers=user_agent)
                    if response.status == 200:
                        status = True
                        post_json = await response.json()
                        post_data = extract_posts_from_json(post_json)
                        if post_data['id']:
                            await insert_data('posts', tuple(post_data.values()))
                            log += f'Post {current_id} successfully inserted into DB \n'
                            query_com = {'contentId': current_id, 'sorting': 'all'}
                            status_comm = False
                            while not status_comm:
                                await asyncio.sleep(random.uniform(0.3, 1.0))
                                response_com = await session.get(
                                    COMMENTS_URL, params=query_com, headers=user_agent, proxy=PROXY_URL)
                                if response_com.status == 200:
                                    status_comm = True
                                    comments_data = await response_com.json()
                                    if "result" in comments_data and "items" in comments_data["result"]:
                                        comment_list = []
                                        for comment in comments_data["result"]["items"]:
                                            # skip comments the API reports as unavailable
                                            if comment["text"] != "Комментарий недоступен":
                                                comment_list.append({
                                                    "post_id": current_id,
                                                    "comment_id": comment["id"],
                                                    "text": comment["text"],
                                                    "author": comment["author"]["name"],
                                                    "published_at": comment["date"],
                                                    "summ": comment["likes"]["summ"],
                                                    "replyTo": comment["replyTo"]
                                                })
                                        await insert_data("comments", comment_list)
                                        log += f'Comments from post {current_id} successfully inserted into DB \n'
                                elif response_com.status == 429:
                                    log += f'{current_id} id comments, code {response_com.status}, sleep {TIMEOUT_SEC} sec \n'
                                    await asyncio.sleep(TIMEOUT_SEC)
                                else:
                                    status_comm = True
                                    log += f'Comments for post {current_id} could not be fetched, status code {response_com.status} \n'
                    elif response.status == 404:
                        status = True
                        log += f'Post {current_id} not found. Status code 404 \n'
                    elif response.status == 429:
                        log += f'{current_id} id post, code {response.status}, sleep {TIMEOUT_SEC} sec \n'
                        await asyncio.sleep(TIMEOUT_SEC)
                    else:
                        status = True
                        log += f'Post {current_id} returned an unexpected status code {response.status} \n'
                    await asyncio.sleep(random.uniform(0.3, 1.0))
        except Exception as err:
            log += f'{current_id}, {err}'
        finally:
            await f.write(log)
async def worker(queue):
    while True:
        current_id = await queue.get()
        await dudos_pdf(current_id)
        queue.task_done()
async def main():
    async with aiofiles.open(file_log, mode='a+') as fl:
        log = ''
        start_at_script = dt.now()
        print('Script started at ', start_at_script)
        log += f'Script started at {start_at_script}\n'
        queue = asyncio.Queue()
        for current_id in LIST_ID:
            queue.put_nowait(current_id)
        tasks = []
        for i in range(count_workers):
            task = asyncio.create_task(worker(queue))
            tasks.append(task)
        started_at = time.monotonic()
        await queue.join()
        total_slept_for = time.monotonic() - started_at
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        log += '====\n'
        log += f'Script ended at {dt.now()}\n'
        log += f'{count_workers} workers ran in parallel for {total_slept_for:.2f} seconds\n'
        log += f'Total work time: {dt.now() - start_at_script}\n'
        print(log)
        await fl.write(log)
if __name__ == "__main__":
    file_log = f'log_{dt.now().strftime("%Y_%m_%d_%H_%M_%S")}.txt'
    open(file_log, 'a+').close()  # create the log file up front
    print(f'{file_log} created!')
    create_table()
    asyncio.run(main())
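The script relies on four third-party packages, all available from PyPI: aiohttp, aiosqlite, aiofiles, and user_agent (which provides generate_user_agent).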
Q: Is this even legal? Doesn't it affect the stability of the site and the network?

A: The DTF API is open and available to anyone, and it already enforces a limit on the number of requests per second. I'm not suggesting anyone DDoS it.
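A worker can honor that per-second limit explicitly instead of relying only on random sleeps. Below is a minimal sketch, assuming a hypothetical budget of three requests per second (DTF's actual quota is not stated here); the limiter would be created once in main() and shared by all workers, each calling await limiter.acquire() right before session.get(...):

import asyncio
import time

class RateLimiter:
    """Allow at most `rate` acquisitions per second across all workers."""

    def __init__(self, rate: float):
        self.interval = 1.0 / rate  # minimum spacing between requests
        self._last = 0.0
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        async with self._lock:
            # sleep just long enough to preserve the gap since the last request
            wait = self._last + self.interval - time.monotonic()
            if wait > 0:
                await asyncio.sleep(wait)
            self._last = time.monotonic()

# Hypothetical usage (a rate of 3 req/s is an assumption, not DTF's documented limit):
# limiter = RateLimiter(3)
# await limiter.acquire()
# response = await session.get(URL, params=query, headers=user_agent)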