Spaces:

ginigen
/

Today

Running

App Files Files Community

ginipick committed on Oct 10

Commit

6085299

verified ·

1 Parent(s): 04168dc

Create app.py

Browse files

Files changed (1) hide show

app.py +444 -0

app.py ADDED Viewed

	@@ -0,0 +1,444 @@

+# -*- coding: utf-8 -*-
+"""
+AI 뉴스 & 허깅페이스 트렌딩 분석 시스템
+- AI Times 뉴스 크롤링 및 카테고리 분류
+- 허깅페이스 모델/스페이스 트렌딩 정보 수집
+- Fireworks AI (Qwen) 를 통한 뉴스 분석
+- Brave Search를 통한 팩트 체크
+"""
+import requests
+from bs4 import BeautifulSoup
+import json
+from datetime import datetime
+from typing import List, Dict, Optional
+import time
+import re
+class AINewsAnalyzer:
+    def __init__(self, fireworks_api_key: str, brave_api_key: str):
+        """
+        Args:
+            fireworks_api_key: Fireworks AI API 키
+            brave_api_key: Brave Search API 키
+        """
+        self.fireworks_api_key = fireworks_api_key
+        self.brave_api_key = brave_api_key
+        # 뉴스 카테고리 정의
+        self.categories = {
+            "산업동향": ["산업", "기업", "투자", "인수", "파트너십", "시장"],
+            "기술혁신": ["기술", "모델", "알고리즘", "개발", "연구", "논문"],
+            "제품출시": ["출시", "공개", "발표", "서비스", "제품"],
+            "정책규제": ["규제", "정책", "법", "정부", "제재"],
+            "보안이슈": ["보안", "취약점", "해킹", "위험", "프라이버시"],
+        }
+        self.huggingface_data = {
+            "models": [],
+            "spaces": []
+        }
+        self.news_data = []
+    def fetch_aitimes_news(self, urls: List[str]) -> List[Dict]:
+        """AI Times 뉴스 크롤링"""
+        all_news = []
+        for url in urls:
+            try:
+                print(f"📰 뉴스 크롤링 중: {url}")
+                response = requests.get(url, headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+                })
+                soup = BeautifulSoup(response.content, 'html.parser')
+                # 뉴스 기사 추출 (실제 구조에 맞게 조정 필요)
+                articles = []
+                # 제목과 링크가 있는 a 태그 찾기
+                for link in soup.find_all('a', href=True):
+                    if '/news/articleView.html' in link['href']:
+                        title = link.get_text(strip=True)
+                        article_url = link['href']
+                        if not article_url.startswith('http'):
+                            article_url = 'https://www.aitimes.com' + article_url
+                        # 날짜 추출 (형제 요소에서)
+                        date_text = ""
+                        parent = link.parent
+                        if parent:
+                            date_elem = parent.find(text=re.compile(r'\d{2}-\d{2}'))
+                            if date_elem:
+                                date_text = date_elem.strip()
+                        if title and len(title) > 10:
+                            articles.append({
+                                'title': title,
+                                'url': article_url,
+                                'date': date_text,
+                                'source': 'AI Times'
+                            })
+                all_news.extend(articles[:10])  # 상위 10개만
+                time.sleep(1)  # 크롤링 예의
+            except Exception as e:
+                print(f"❌ 크롤링 오류: {e}")
+        return all_news
+    def fetch_huggingface_trending(self) -> Dict:
+        """허깅페이스 트렌딩 모델 및 스페이스 수집"""
+        print("🤗 허깅페이스 트렌딩 정보 수집 중...")
+        # 모델 트렌딩
+        try:
+            models_url = "https://huggingface.co/api/models"
+            params = {
+                'sort': 'trending',
+                'limit': 30
+            }
+            response = requests.get(models_url, params=params, timeout=10)
+            if response.status_code == 200:
+                models = response.json()
+                for model in models[:30]:
+                    self.huggingface_data['models'].append({
+                        'name': model.get('id', 'Unknown'),
+                        'downloads': model.get('downloads', 0),
+                        'likes': model.get('likes', 0),
+                        'task': model.get('pipeline_tag', 'N/A'),
+                        'url': f"https://huggingface.co/{model.get('id', '')}"
+                    })
+                print(f"✅ {len(self.huggingface_data['models'])}개 트렌딩 모델 수집 완료")
+        except Exception as e:
+            print(f"❌ 모델 수집 오류: {e}")
+        # 스페이스 트렌딩 (웹 크롤링)
+        try:
+            spaces_url = "https://huggingface.co/spaces"
+            response = requests.get(spaces_url, headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+            }, timeout=10)
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # 스페이스 링크 추출
+            space_count = 0
+            for link in soup.find_all('a', href=True):
+                if '/spaces/' in link['href'] and space_count < 30:
+                    space_name = link['href'].replace('/spaces/', '')
+                    if '/' in space_name and len(space_name) > 3:
+                        title = link.get_text(strip=True)
+                        if title:
+                            self.huggingface_data['spaces'].append({
+                                'name': space_name,
+                                'title': title[:100],
+                                'url': f"https://huggingface.co{link['href']}"
+                            })
+                            space_count += 1
+            print(f"✅ {len(self.huggingface_data['spaces'])}개 트렌딩 스페이스 수집 완료")
+        except Exception as e:
+            print(f"❌ 스페이스 수집 오류: {e}")
+        return self.huggingface_data
+    def categorize_news(self, news_list: List[Dict]) -> List[Dict]:
+        """뉴스 카테고리 분류"""
+        for news in news_list:
+            title = news['title'].lower()
+            news['category'] = "기타"
+            for category, keywords in self.categories.items():
+                if any(keyword in title for keyword in keywords):
+                    news['category'] = category
+                    break
+        return news_list
+    def analyze_with_qwen(self, text: str, instruction: str) -> str:
+        """Fireworks AI Qwen 모델을 사용한 분석"""
+        url = "https://api.fireworks.ai/inference/v1/chat/completions"
+        payload = {
+            "model": "accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
+            "max_tokens": 4096,
+            "top_p": 1,
+            "top_k": 40,
+            "presence_penalty": 0,
+            "frequency_penalty": 0,
+            "temperature": 0.6,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": "당신은 AI 뉴스를 초등학생도 이해할 수 있게 쉽게 설명하는 전문가입니다."
+                },
+                {
+                    "role": "user",
+                    "content": f"{instruction}\n\n뉴스: {text}"
+                }
+            ]
+        }
+        headers = {
+            "Accept": "application/json",
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.fireworks_api_key}"
+        }
+        try:
+            response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=30)
+            if response.status_code == 200:
+                result = response.json()
+                return result['choices'][0]['message']['content']
+            else:
+                return f"분석 실패 (상태 코드: {response.status_code})"
+        except Exception as e:
+            return f"분석 오류: {str(e)}"
+    def fact_check_with_brave(self, query: str) -> List[Dict]:
+        """Brave Search를 통한 팩트 체크"""
+        url = "https://api.search.brave.com/res/v1/web/search"
+        headers = {
+            "Accept": "application/json",
+            "X-Subscription-Token": self.brave_api_key
+        }
+        params = {
+            "q": query,
+            "count": 5,
+            "text_decorations": False,
+            "search_lang": "ko"
+        }
+        try:
+            response = requests.get(url, headers=headers, params=params, timeout=10)
+            if response.status_code == 200:
+                data = response.json()
+                results = []
+                if 'web' in data and 'results' in data['web']:
+                    for item in data['web']['results'][:3]:
+                        results.append({
+                            'title': item.get('title', ''),
+                            'description': item.get('description', ''),
+                            'url': item.get('url', '')
+                        })
+                return results
+            else:
+                return []
+        except Exception as e:
+            print(f"❌ Brave Search 오류: {e}")
+            return []
+    def generate_report(self, news_list: List[Dict], analyze_news: bool = True) -> str:
+        """종합 리포트 생성"""
+        report = []
+        report.append("=" * 80)
+        report.append("📊 AI 뉴스 & 허깅페이스 트렌딩 종합 리포트")
+        report.append(f"📅 생성일시: {datetime.now().strftime('%Y년 %m월 %d일 %H:%M')}")
+        report.append("=" * 80)
+        report.append("")
+        # 1. 카테고리별 뉴스 분석
+        report.append("📰 === AI TIMES 뉴스 분석 ===")
+        report.append("")
+        categorized_news = {}
+        for news in news_list:
+            category = news.get('category', '기타')
+            if category not in categorized_news:
+                categorized_news[category] = []
+            categorized_news[category].append(news)
+        for category, articles in categorized_news.items():
+            report.append(f"📌 [{category}] ({len(articles)}건)")
+            report.append("-" * 80)
+            for i, article in enumerate(articles[:5], 1):  # 카테고리당 5개만
+                report.append(f"{i}. {article['title']}")
+                report.append(f"   🔗 {article['url']}")
+                report.append(f"   📅 {article.get('date', 'N/A')}")
+                # LLM 분석 (선택적)
+                if analyze_news and i <= 2:  # 각 카테고리 상위 2개만 분석
+                    print(f"🤖 LLM 분석 중: {article['title'][:50]}...")
+                    instruction = """이 뉴스를 다음 형식으로 분석해주세요:
+1. 핵심 내용 (2-3문장, 초등학생 수준)
+2. 왜 중요한가? (1-2문장)
+3. 당신이 해야 할 행동 (1-2개 항목)
+간결하고 명확하게 작성해주세요."""
+                    analysis = self.analyze_with_qwen(article['title'], instruction)
+                    report.append(f"\n   🤖 AI 분석:")
+                    for line in analysis.split('\n'):
+                        if line.strip():
+                            report.append(f"      {line.strip()}")
+                    # 팩트 체크 (선택적)
+                    fact_check = self.fact_check_with_brave(article['title'][:100])
+                    if fact_check:
+                        report.append(f"\n   ✅ 팩트 체크 (Brave Search):")
+                        for fc in fact_check[:2]:
+                            report.append(f"      • {fc['title']}")
+                            report.append(f"        {fc['url']}")
+                    time.sleep(2)  # API 레이트 리밋 고려
+                report.append("")
+            report.append("")
+        # 2. 허깅페이스 트렌딩
+        report.append("🤗 === 허깅페이스 트렌딩 TOP 30 ===")
+        report.append("")
+        # 모델
+        report.append("🔥 트렌딩 모델 TOP 30")
+        report.append("-" * 80)
+        for i, model in enumerate(self.huggingface_data['models'][:30], 1):
+            report.append(f"{i:2d}. {model['name']}")
+            report.append(f"    📊 다운로드: {model['downloads']:,} | ❤️ 좋아요: {model['likes']:,}")
+            report.append(f"    🏷️  Task: {model['task']}")
+            report.append(f"    🔗 {model['url']}")
+            report.append("")
+        report.append("")
+        # 스페이스
+        report.append("🚀 트렌딩 스페이스 TOP 30")
+        report.append("-" * 80)
+        for i, space in enumerate(self.huggingface_data['spaces'][:30], 1):
+            report.append(f"{i:2d}. {space['name']}")
+            report.append(f"    📝 {space['title']}")
+            report.append(f"    🔗 {space['url']}")
+            report.append("")
+        # 3. 종합 요약
+        report.append("=" * 80)
+        report.append("📈 종합 요약")
+        report.append("=" * 80)
+        report.append(f"• 총 뉴스 수집: {len(news_list)}건")
+        report.append(f"• 카테고리 수: {len(categorized_news)}개")
+        report.append(f"• 트렌딩 모델: {len(self.huggingface_data['models'])}개")
+        report.append(f"• 트렌딩 스페이스: {len(self.huggingface_data['spaces'])}개")
+        report.append("")
+        return '\n'.join(report)
+    def run_full_analysis(self, news_urls: List[str], analyze_with_llm: bool = True) -> str:
+        """전체 분석 실행"""
+        print("🚀 AI 뉴스 & 허깅페이스 트렌딩 분석 시작...")
+        print("")
+        # 1. 뉴스 수집
+        news_list = self.fetch_aitimes_news(news_urls)
+        print(f"✅ 총 {len(news_list)}건의 뉴스 수집 완료")
+        print("")
+        # 2. 뉴스 카테고리 분류
+        categorized_news = self.categorize_news(news_list)
+        print("✅ 뉴스 카테고리 분류 완료")
+        print("")
+        # 3. 허깅페이스 트렌딩 수집
+        self.fetch_huggingface_trending()
+        print("")
+        # 4. 리포트 생성
+        print("📝 리포트 생성 중...")
+        report = self.generate_report(categorized_news, analyze_news=analyze_with_llm)
+        print("")
+        print("✅ 분석 완료!")
+        return report
+    def save_report(self, report: str, filename: str = None):
+        """리포트 저장"""
+        if filename is None:
+            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+            filename = f"ai_news_report_{timestamp}.txt"
+        with open(filename, 'w', encoding='utf-8') as f:
+            f.write(report)
+        print(f"💾 리포트 저장 완료: {filename}")
+# ==================== 사용 예시 ====================
+def main():
+    """메인 실행 함수"""
+    # API 키 설정
+    FIREWORKS_API_KEY = "YOUR_FIREWORKS_API_KEY"  # 여기에 Fireworks API 키 입력
+    BRAVE_API_KEY = "YOUR_BRAVE_API_KEY"  # 여기에 Brave Search API 키 입력
+    # AI Times 뉴스 URL
+    news_urls = [
+        "https://www.aitimes.com/news/articleList.html?sc_multi_code=S2&view_type=sm",  # AI 산업
+        "https://www.aitimes.com/news/articleList.html?sc_section_code=S1N24&view_type=sm"  # AI 기술
+    ]
+    # 분석기 초기화
+    analyzer = AINewsAnalyzer(
+        fireworks_api_key=FIREWORKS_API_KEY,
+        brave_api_key=BRAVE_API_KEY
+    )
+    # 전체 분석 실행
+    # analyze_with_llm=False로 설정하면 LLM 분석 없이 빠르게 수집만 함
+    report = analyzer.run_full_analysis(
+        news_urls=news_urls,
+        analyze_with_llm=True  # LLM 분석 활성화 (시간이 오래 걸림)
+    )
+    # 결과 출력
+    print("\n" + "=" * 80)
+    print(report)
+    # 파일 저장
+    analyzer.save_report(report)
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+# ==================== 사용 팁 ====================
+"""
+1. API 키 설정:
+   - Fireworks AI: https://fireworks.ai/
+   - Brave Search: https://brave.com/search/api/
+2. 빠른 테스트 (LLM 분석 없이):
+   analyzer.run_full_analysis(news_urls, analyze_with_llm=False)
+3. 특정 카테고리만 분석:
+   categorized_news에서 원하는 카테고리 필터링
+4. 크롤링 주기 조정:
+   time.sleep() 값을 조정하여 속도/안정성 균형
+5. 결과 활용:
+   - JSON으로 저장: json.dumps(analyzer.huggingface_data)
+   - 데이터베이스 저장
+   - 대시보드 연동
+"""