diff options
| author | techchud <protrude_paying969@simplelogin.com> | 2025-08-17 19:24:03 -0500 | 
|---|---|---|
| committer | techchud <protrude_paying969@simplelogin.com> | 2025-08-17 19:24:03 -0500 | 
| commit | 70c0c24ffba1e9032069574e121cdeb7ba13323f (patch) | |
| tree | 8371e9a1120a2b6ceb18fccc117fdf374236973b /aux-files | |
| parent | 23c973ed8ec18cf7e3dfc12154fd1277c9f0b4d1 (diff) | |
| download | guix-techchud-70c0c24ffba1e9032069574e121cdeb7ba13323f.tar.gz guix-techchud-70c0c24ffba1e9032069574e121cdeb7ba13323f.tar.bz2 guix-techchud-70c0c24ffba1e9032069574e121cdeb7ba13323f.zip | |
Revert "remove snsscrape patch"
This reverts commit 12a4266ca2a4916e16cb387a4f65dbea0f217d6f.
Diffstat (limited to 'aux-files')
| -rw-r--r-- | aux-files/snscrape/snscrape-downloads-telegram.patch | 495 | 
1 files changed, 495 insertions, 0 deletions
| diff --git a/aux-files/snscrape/snscrape-downloads-telegram.patch b/aux-files/snscrape/snscrape-downloads-telegram.patch new file mode 100644 index 0000000..46665c4 --- /dev/null +++ b/aux-files/snscrape/snscrape-downloads-telegram.patch @@ -0,0 +1,495 @@ +From 00239388e3096277a55271a8786b4b5d6d2bec84 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 18 Jan 2024 11:37:32 -0500 +Subject: [PATCH 1/8] WIP: Fixed 2.5 out of 5 issues mentioned in PR + +--- + snscrape/base.py             |  1 + + snscrape/modules/telegram.py | 12 ++++++------ + 2 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index c9e75d9d..5ce5e1da 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,6 +193,7 @@ def _request(self, method, url, params = None, data = None, headers = None, time + 			# The request is newly prepared on each retry because of potential cookie updates. + 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + 			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) ++			_logger.info("Hey there, I'm in here") + 			_logger.info(f'Retrieving {req.url}') + 			_logger.debug(f'... with headers: {headers!r}') + 			if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 4e977656..54345d96 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,6 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + 				} + 				timeTag = videoPlayer.find('time') + 				if timeTag is None: ++					_logger.warning(f'Could not find duration for video or GIF at {url}') + 					cls = Gif + 				else: + 					cls = Video +@@ -219,8 +220,6 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + 					else: + 						_logger.warning(f'Could not process link preview image on {url}') + 				linkPreview = LinkPreview(**kwargs) +-				if kwargs['href'] in outlinks: +-					outlinks.remove(kwargs['href']) +  + 			viewsSpan = post.find('span', class_ = 'tgme_widget_message_views') + 			views = None if viewsSpan is None else _parse_num(viewsSpan.text) +@@ -239,13 +238,14 @@ def get_items(self): + 			return + 		nextPageUrl = '' + 		while True: ++			print("About to yield from get_items") + 			yield from self._soup_to_items(soup, r.url) +-			try: +-				if soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True)['href'].split('/')[-1] == '1': ++			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) ++			if dateElt and 'href' in dateElt.attrs: ++				urlPieces = dateElt['href'].split('/') ++				if urlPieces and urlPieces[-1] == '1': + 					# if message 1 is the first message in the page, terminate scraping + 					break +-			except: +-				pass + 			pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) + 			if not pageLink: + 				# some pages are missing a "tme_messages_more" tag, causing early termination + +From 670905fedb64656b94c6fb920c8628d318171b64 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 18 Jan 2024 11:46:46 -0500 +Subject: [PATCH 2/8] Remove test log statement, add link to example GIF + +--- + snscrape/base.py             | 1 - + snscrape/modules/telegram.py | 2 +- + 2 files changed, 1 insertion(+), 2 deletions(-) + +diff --git a/snscrape/base.py b/snscrape/base.py +index 5ce5e1da..c9e75d9d 100644 +--- a/snscrape/base.py ++++ b/snscrape/base.py +@@ -193,7 +193,6 @@ def _request(self, method, url, params = None, data = None, headers = None, time + 			# The request is newly prepared on each retry because of potential cookie updates. + 			req = self._session.prepare_request(requests.Request(method, url, params = params, data = data, headers = headers)) + 			environmentSettings = self._session.merge_environment_settings(req.url, proxies, None, None, None) +-			_logger.info("Hey there, I'm in here") + 			_logger.info(f'Retrieving {req.url}') + 			_logger.debug(f'... with headers: {headers!r}') + 			if data: +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 54345d96..01e99318 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -196,7 +196,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + 				} + 				timeTag = videoPlayer.find('time') + 				if timeTag is None: +-					_logger.warning(f'Could not find duration for video or GIF at {url}') ++					# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 + 					cls = Gif + 				else: + 					cls = Video + +From 54df8832f5b5bc3af58c3faf953966a2070a834d Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 22 Feb 2024 01:06:04 -0500 +Subject: [PATCH 3/8] Added media processing into main link loop; using prev + tag to get page, rather than index math + +--- + snscrape/modules/telegram.py | 84 +++++++++++++++++++----------------- + 1 file changed, 44 insertions(+), 40 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 01e99318..b4f3d78e 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -152,7 +152,7 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + 						imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) + 						if len(imageUrls) == 1: + 							media.append(Photo(url = imageUrls[0])) +-						continue ++ + 				if _SINGLE_MEDIA_LINK_PATTERN.match(link['href']): + 					style = link.attrs.get('style', '') + 					imageUrls = _STYLE_MEDIA_URL_PATTERN.findall(style) +@@ -161,49 +161,23 @@ def _soup_to_items(self, soup, pageUrl, onlyUsername = False): + 						# resp = self._get(image[0]) + 						# encoded_string = base64.b64encode(resp.content) + 					# Individual photo or video link +-					continue ++ + 				if link.text.startswith('@'): + 					mentions.append(link.text.strip('@')) +-					continue ++ + 				if link.text.startswith('#'): + 					hashtags.append(link.text.strip('#')) +-					continue ++ ++				if 'tgme_widget_message_voice_player' in link.get('class', []): ++					media.append(_parse_voice_message(link)) ++					 ++				if 'tgme_widget_message_video_player' in link.get('class', []): ++					media.append(_parse_video_message(link)) ++ + 				href = urllib.parse.urljoin(pageUrl, link['href']) + 				if (href not in outlinks) and (href != rawUrl) and (href != forwardedUrl): + 					outlinks.append(href) +  +-			for voicePlayer in post.find_all('a', {'class': 'tgme_widget_message_voice_player'}): +-				audioUrl = voicePlayer.find('audio')['src'] +-				durationStr = voicePlayer.find('time').text +-				duration = _durationStrToSeconds(durationStr) +-				barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] +- +-				media.append(VoiceMessage(url = audioUrl, duration = duration, bars = barHeights)) +- +-			for videoPlayer in post.find_all('a', {'class': 'tgme_widget_message_video_player'}): +-				iTag = videoPlayer.find('i') +-				if iTag is None: +-					videoUrl = None  +-					videoThumbnailUrl = None +-				else: +-					style = iTag['style'] +-					videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] +-					videoTag = videoPlayer.find('video') +-					videoUrl = None if videoTag is None else videoTag['src'] +-				mKwargs = { +-					'thumbnailUrl': videoThumbnailUrl, +-					'url': videoUrl, +-				} +-				timeTag = videoPlayer.find('time') +-				if timeTag is None: +-					# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 +-					cls = Gif +-				else: +-					cls = Video +-					durationStr = videoPlayer.find('time').text +-					mKwargs['duration'] = _durationStrToSeconds(durationStr) +-				media.append(cls(**mKwargs)) +- + 			linkPreview = None + 			if (linkPreviewA := post.find('a', class_ = 'tgme_widget_message_link_preview')): + 				kwargs = {} +@@ -250,10 +224,10 @@ def get_items(self): + 			if not pageLink: + 				# some pages are missing a "tme_messages_more" tag, causing early termination + 				if '=' not in nextPageUrl: +-					nextPageUrl =  soup.find('link', attrs = {'rel': 'canonical'}, href = True)['href'] +-				nextPostIndex = int(nextPageUrl.split('=')[-1]) - 20 ++					nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] ++				nextPostIndex = int(nextPageUrl.split('=')[-1]) + 				if nextPostIndex > 20: +-					pageLink = {'href': nextPageUrl.split('=')[0] + f'={nextPostIndex}'} ++					pageLink = {'href': nextPageUrl} + 				else: + 					break + 			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) +@@ -333,4 +307,34 @@ def _telegramResponseOkCallback(r): + 	if r.status_code == 200: + 		return (True, None) + 	return (False, f'{r.status_code=}') +-	 +\ No newline at end of file ++	 ++def _parse_voice_message(voicePlayer): ++	audioUrl = voicePlayer.find('audio')['src'] ++	durationStr = voicePlayer.find('time').text ++	duration = _durationStrToSeconds(durationStr) ++	barHeights = [float(s['style'].split(':')[-1].strip(';%')) for s in voicePlayer.find('div', {'class': 'bar'}).find_all('s')] ++	return VoiceMessage(url = audioUrl, duration = duration, bars = barHeights) ++ ++def _parse_video_message(videoPlayer): ++	iTag = videoPlayer.find('i') ++	if iTag is None: ++		videoUrl = None  ++		videoThumbnailUrl = None ++	else: ++		style = iTag['style'] ++		videoThumbnailUrl = _STYLE_MEDIA_URL_PATTERN.findall(style)[0] ++		videoTag = videoPlayer.find('video') ++		videoUrl = None if videoTag is None else videoTag['src'] ++	mKwargs = { ++		'thumbnailUrl': videoThumbnailUrl, ++		'url': videoUrl, ++	} ++	timeTag = videoPlayer.find('time') ++	if timeTag is None: ++		# Example of duration-less GIF: https://t.me/thisisatestchannel19451923/3 ++		cls = Gif ++	else: ++		cls = Video ++		durationStr = videoPlayer.find('time').text ++		mKwargs['duration'] = _durationStrToSeconds(durationStr) ++	return cls(**mKwargs) +\ No newline at end of file + +From 2dfd1542f19bbadad603e00e61712943542fbfe1 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 22 Feb 2024 01:07:46 -0500 +Subject: [PATCH 4/8] Forgot to remove a test log + +--- + snscrape/modules/telegram.py | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index b4f3d78e..8f6d18d7 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -212,7 +212,6 @@ def get_items(self): + 			return + 		nextPageUrl = '' + 		while True: +-			print("About to yield from get_items") + 			yield from self._soup_to_items(soup, r.url) + 			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + 			if dateElt and 'href' in dateElt.attrs: + +From a93f6a3fad0d19209a49c7b730fea73659743774 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Fri, 1 Mar 2024 12:51:26 -0500 +Subject: [PATCH 5/8] Applying trislee's suggested fix for getting nextPageUrl + +--- + snscrape/modules/telegram.py | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 8f6d18d7..ac0feef8 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -219,17 +219,11 @@ def get_items(self): + 				if urlPieces and urlPieces[-1] == '1': + 					# if message 1 is the first message in the page, terminate scraping + 					break +-			pageLink = soup.find('a', attrs = {'class': 'tme_messages_more', 'data-before': True}) +-			if not pageLink: +-				# some pages are missing a "tme_messages_more" tag, causing early termination +-				if '=' not in nextPageUrl: +-					nextPageUrl = soup.find('link', attrs = {'rel': 'prev'}, href = True)['href'] +-				nextPostIndex = int(nextPageUrl.split('=')[-1]) +-				if nextPostIndex > 20: +-					pageLink = {'href': nextPageUrl} +-				else: +-					break +-			nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++			if pageLink := soup.find('link', attrs = {'rel': 'prev'}, href = True): ++				nextPageUrl = urllib.parse.urljoin(r.url, pageLink['href']) ++			else: ++				nextPostIndex = int(soup.find('div', attrs = {'class': 'tgme_widget_message', 'data-post': True})["data-post"].split("/")[-1]) ++				nextPageUrl = urllib.parse.urljoin(r.url, r.url.split('?')[0] + f'?before={nextPostIndex}') + 			r = self._get(nextPageUrl, headers = self._headers, responseOkCallback = _telegramResponseOkCallback) + 			if r.status_code != 200: + 				raise snscrape.base.ScraperException(f'Got status code {r.status_code}') + +From a542aa57598f94f69fd7b69789e97045e92133da Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 14 Mar 2024 01:50:38 -0400 +Subject: [PATCH 6/8] Ensured termination on channels w/o an id=1 post, wrote + test cases to prevent regression + +--- + snscrape/modules/telegram.py | 87 +++++++++++++++++++++++++++++++++++- + 1 file changed, 86 insertions(+), 1 deletion(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index ac0feef8..7a85cb58 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -9,6 +9,8 @@ + import snscrape.base + import typing + import urllib.parse ++import unittest ++import threading +  + _logger = logging.getLogger(__name__) + _SINGLE_MEDIA_LINK_PATTERN = re.compile(r'^https://t\.me/[^/]+/\d+\?single$') +@@ -212,6 +214,8 @@ def get_items(self): + 			return + 		nextPageUrl = '' + 		while True: ++			if soup.find("div", class_ = "tme_no_messages_found"): ++				break + 			yield from self._soup_to_items(soup, r.url) + 			dateElt = soup.find('a', attrs = {'class': 'tgme_widget_message_date'}, href = True) + 			if dateElt and 'href' in dateElt.attrs: +@@ -330,4 +334,85 @@ def _parse_video_message(videoPlayer): + 		cls = Video + 		durationStr = videoPlayer.find('time').text + 		mKwargs['duration'] = _durationStrToSeconds(durationStr) +-	return cls(**mKwargs) +\ No newline at end of file ++	return cls(**mKwargs) ++ ++class TestTelegramChannelScraper(unittest.TestCase): ++ ++	@staticmethod ++	def execute_with_timeout(func, timeout=10): ++		""" ++		Executes a function in a separate thread and enforces a timeout. ++		If provided function throws an error, it's re-raised in main thread. ++		Used to detect infinite loops in finite time, works cross-platform. ++		 ++		:param func: The function to execute. This function should accept no arguments. ++		:param timeout: The timeout in seconds. ++		""" ++		exceptions=[] ++		def func_passing_exceptions(): ++			try: ++				func() ++			except Exception as e: ++				exceptions.append((e.__class__, e, e.__traceback__)) ++ ++		thread = threading.Thread(target=func_passing_exceptions) ++		thread.start() ++		thread.join(timeout=timeout) ++ ++		if exceptions: ++			exc_class, exc_instance, traceback = exceptions[0] ++			raise exc_class(exc_instance).with_traceback(traceback) ++		 ++		if thread.is_alive(): ++			raise TimeoutError(f"Function didn't complete within {timeout} seconds") ++ ++	def test_scraping_termination_missing_prev(self): ++		"""Test scraping always terminates, even if the page's prev link is missing.""" ++ ++		def scrape_two_pages(): ++			scraper = TelegramChannelScraper('WLM_USA_TEXAS?before=3766') ++			items = list() ++			num_items_on_page = 20 ++			for item in scraper.get_items(): ++				items.append(item) ++				if len(items) > 2 * num_items_on_page: ++					break ++		 ++		self.execute_with_timeout(scrape_two_pages) ++ ++	def test_scraping_termination_small_post_count(self): ++		"""Test scraping always terminates, even with small number of posts. This channel has only 28.""" ++ ++		def scrape_small_channel(): ++			scraper = TelegramChannelScraper('AKCPB') ++			items = list(scraper.get_items()) ++			return items ++		 ++		self.execute_with_timeout(scrape_small_channel) ++ ++	def test_scraping_termination_channels_without_post_id_one(self): ++		"""Test scraping gracefully handles channels missing a post where id=1.""" ++ ++		def scrape_empty_page(): ++			scraper = TelegramChannelScraper('BREAKDCODE?before=3') ++			for _ in scraper.get_items(): ++				pass ++		 ++		self.execute_with_timeout(scrape_empty_page) ++ ++	def test_media_order_preservation(self): ++		"""Test scraped media appears in the same order as in the post.""" ++		scraper = TelegramChannelScraper('nexta_live?before=43103') ++		item = next(scraper.get_items(), None) ++		self.assertIsNotNone(item, "Failed to scrape any posts.") ++		self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") ++ ++		# Directly validate the types of the objects in the media array ++		expected_types = [Video, Photo, Video]  # Adjust based on expected types ++		actual_types = [type(media) for media in item.media] if item.media else [] ++		 ++		self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") ++ ++ ++if __name__ == '__main__': ++	unittest.main() + +From 7d061cb5279e153f829340f848bc4ba01d716f26 Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 14 Mar 2024 01:55:16 -0400 +Subject: [PATCH 7/8] Add docstring saying suite should run by directly running + file + +--- + snscrape/modules/telegram.py | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index 7a85cb58..c6e0b0ee 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -336,7 +336,9 @@ def _parse_video_message(videoPlayer): + 		mKwargs['duration'] = _durationStrToSeconds(durationStr) + 	return cls(**mKwargs) +  ++ + class TestTelegramChannelScraper(unittest.TestCase): ++	"""Run suite by directly calling this file.""" +  + 	@staticmethod + 	def execute_with_timeout(func, timeout=10): + +From 9309b1b01c6db15862809623e2c5adddecd894be Mon Sep 17 00:00:00 2001 +From: John O'Sullivan <j.osullivan42@gmail.com> +Date: Thu, 14 Mar 2024 02:00:50 -0400 +Subject: [PATCH 8/8] Correct some inaccurate test descriptions + +--- + snscrape/modules/telegram.py | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +diff --git a/snscrape/modules/telegram.py b/snscrape/modules/telegram.py +index c6e0b0ee..dbf0f9b3 100644 +--- a/snscrape/modules/telegram.py ++++ b/snscrape/modules/telegram.py +@@ -338,7 +338,7 @@ def _parse_video_message(videoPlayer): +  +  + class TestTelegramChannelScraper(unittest.TestCase): +-	"""Run suite by directly calling this file.""" ++	"""Run suite by directly running this file.""" +  + 	@staticmethod + 	def execute_with_timeout(func, timeout=10): +@@ -383,7 +383,7 @@ def scrape_two_pages(): + 		self.execute_with_timeout(scrape_two_pages) +  + 	def test_scraping_termination_small_post_count(self): +-		"""Test scraping always terminates, even with small number of posts. This channel has only 28.""" ++		"""Test scraping always terminates, even with small number of posts. This channel's highest ID is 28.""" +  + 		def scrape_small_channel(): + 			scraper = TelegramChannelScraper('AKCPB') +@@ -392,8 +392,8 @@ def scrape_small_channel(): + 		 + 		self.execute_with_timeout(scrape_small_channel) +  +-	def test_scraping_termination_channels_without_post_id_one(self): +-		"""Test scraping gracefully handles channels missing a post where id=1.""" ++	def test_scraping_termination_pages_without_posts(self): ++		"""Test scraping gracefully handles pages without any posts.""" +  + 		def scrape_empty_page(): + 			scraper = TelegramChannelScraper('BREAKDCODE?before=3') +@@ -407,10 +407,11 @@ def test_media_order_preservation(self): + 		scraper = TelegramChannelScraper('nexta_live?before=43103') + 		item = next(scraper.get_items(), None) + 		self.assertIsNotNone(item, "Failed to scrape any posts.") ++ ++		# This particular post is known to include media [Video, Photo, Video] + 		self.assertEqual(item.url, "https://t.me/s/nexta_live/43102") +  +-		# Directly validate the types of the objects in the media array +-		expected_types = [Video, Photo, Video]  # Adjust based on expected types ++		expected_types = [Video, Photo, Video] + 		actual_types = [type(media) for media in item.media] if item.media else [] + 		 + 		self.assertEqual(actual_types, expected_types, "Media did not appear in the expected order.") | 
