In this article, we create a process that fetches papers from arXiv daily, generates summaries using generative AI, and posts them to Slack.
Python Code for the Process
Below is the Python code that executes the process:
import datetime
import logging
import os
import time
import arxiv
import google.generativeai as genai
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
# arXiv category codes to search; get_papers() ORs them together into one query.
PAPER_TYPE_LIST = ["cs.AI", "cs.CY", "cs.MA"]
# arXiv query endpoint. NOTE(review): not referenced by the code shown here.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
# Gemini API key, read at import time: a missing variable raises KeyError
# as soon as the module is loaded.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
# Model name passed to genai.GenerativeModel in generate_summary().
GEMINI_MODEL = "gemini-2.0-flash"
# NOTE(review): SLACK_TOKEN and SLACK_BOT_TOKEN read the same environment
# variable; send_slack_message() uses SLACK_BOT_TOKEN.
SLACK_TOKEN = os.environ["SLACK_BOT_TOKEN"] # Slack Bot token (xoxb-...)
SLACK_BOT_TOKEN = os.environ["SLACK_BOT_TOKEN"]
SLACK_CHANNEL = os.environ["SLACK_CHANNEL"] # ID of the channel to post to (e.g., "C12345...")
# Number of results main() requests from arXiv per run.
MAX_RESULTS = 30
def get_papers(max_results: int = 10):
    """Fetch the newest arXiv submissions in the configured categories.

    Searches arXiv for papers in any category listed in ``PAPER_TYPE_LIST``,
    newest submission first, and keeps only those published within 24 hours
    of the most recent result.

    Args:
        max_results: Maximum number of results requested from arXiv.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``pdf_url`` and
        ``published``; an empty list when the search returns nothing.
    """
    category_terms = [f"cat:{category}" for category in PAPER_TYPE_LIST]
    search = arxiv.Search(
        query=" OR ".join(category_terms),
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
    fetched = list(arxiv.Client().results(search))
    if not fetched:
        return fetched

    # The 24-hour window is anchored on the newest result, not on "now".
    cutoff = fetched[0].published - datetime.timedelta(hours=24)
    recent = []
    for entry in fetched:
        if entry.published < cutoff:
            continue
        recent.append(
            {
                "title": entry.title,
                "summary": entry.summary,
                "pdf_url": entry.pdf_url,
                "published": entry.published,
            }
        )
    return recent
def generate_summary(abstract_text):
    """Ask Gemini for a short, beginner-friendly summary of an abstract.

    Args:
        abstract_text: The paper abstract; appended to the prompt after a
            ``---`` separator.

    Returns:
        The text of the model's response with surrounding whitespace removed.
    """
    instructions = (
        "Please summarize the following paper abstract in a clear and "
        "easy-to-understand way for beginners, "
        "in English and within 300 characters.\n"
        "Also include the significance and results of the paper.\n"
        "Output only the summary result."
    )
    full_prompt = f"{instructions}\n---\n\n{abstract_text}"

    genai.configure(api_key=GEMINI_API_KEY)
    gemini = genai.GenerativeModel(GEMINI_MODEL)
    reply = gemini.generate_content(full_prompt)
    return reply.text.strip()
def send_slack_message(papers):
    """Post all paper summaries to Slack as one combined message.

    Each paper becomes a numbered section (title, summary, PDF link,
    publication date); sections are joined with a horizontal rule and sent
    to ``SLACK_CHANNEL`` in a single ``chat.postMessage`` call.

    Args:
        papers: Iterable of dicts with the keys ``title``, ``summary``,
            ``pdf_link`` and ``published``.

    Returns:
        None. Slack API errors are logged rather than raised.
    """
    logger = logging.getLogger(__name__)

    sections = [
        (
            f"{i}: *{paper['title']}*\n\n"
            f"{paper['summary']}\n\n"
            f"PDF: {paper['pdf_link']}\n"
            f"Published: {paper['published']}"
        )
        for i, paper in enumerate(papers, 1)
    ]
    if not sections:
        # Slack rejects a message with empty text, so skip the API call.
        logger.info("No papers to post; skipping Slack message.")
        return

    all_message = "\n\n────────────────────────\n\n".join(sections)
    client = WebClient(token=SLACK_BOT_TOKEN)
    try:
        result = client.chat_postMessage(
            channel=SLACK_CHANNEL,
            text=all_message,
        )
    except SlackApiError as e:
        logger.error("Error posting message: %s", e)
    else:
        logger.info(result)
def main():
    """Fetch recent papers, summarize each one, and post the batch to Slack.

    Requests up to ``MAX_RESULTS`` papers, replaces each abstract with a
    generated summary, and hands the resulting list to send_slack_message().
    """
    summarized = []
    for paper in get_papers(MAX_RESULTS):
        # Read every field before calling the model so a malformed entry
        # fails before any API request is made.
        title, abstract, pdf_url, published = (
            paper["title"],
            paper["summary"],
            paper["pdf_url"],
            paper["published"],
        )
        summarized.append(
            {
                "title": title,
                "summary": generate_summary(abstract),
                "pdf_link": pdf_url,
                "published": published,
            }
        )
        # Pause between papers to space out the summarization requests.
        time.sleep(1)
    send_slack_message(summarized)
if __name__ == "__main__":
main()
Using Gemini for Summarization
In this implementation, we use Gemini. Please set your API key in the GEMINI_API_KEY environment variable.
- Set
SLACK_BOT_TOKEN
to your Bot User OAuth Token (which requires the chat:write
permission). - Set
SLACK_CHANNEL
to the ID of the channel where you want to post the messages.
Running the Process on AWS Lambda
We will use AWS Lambda to run this process on a schedule.
- Set up a Lambda function using Python 3.12.
- Include external libraries via a ZIP file layer.
Local Setup Steps
- Set Python 3.12 using pyenv.
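- Install the required libraries into a target folder (for a Python Lambda layer the packages must sit under a top-level folder named python, so use python as the folder name):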
pip install arxiv google-generativeai slack_sdk -t {folder}
- Create a ZIP archive of the folder:
zip -r ./upload.zip ./{folder}/*
- Upload the ZIP file to an AWS Lambda layer. Set the architecture to
x86_64
and the runtime to Python 3.12, then create the layer.
Modifying the Code for AWS Lambda
To run the process on AWS Lambda, modify the code by removing the if __name__ == "__main__":
block and adding a lambda_handler
function. The complete code becomes:
import datetime
import logging
import os
import time
import arxiv
import google.generativeai as genai
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
# arXiv category codes to search; get_papers() ORs them together into one query.
PAPER_TYPE_LIST = ["cs.AI", "cs.CY", "cs.MA"]
# arXiv query endpoint. NOTE(review): not referenced by the code shown here.
ARXIV_API_URL = "http://export.arxiv.org/api/query"
# Gemini API key, read at import time: a missing variable raises KeyError
# as soon as the module is loaded.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
# Model name passed to genai.GenerativeModel in generate_summary().
GEMINI_MODEL = "gemini-2.0-flash"
# NOTE(review): SLACK_TOKEN and SLACK_BOT_TOKEN read the same environment
# variable; send_slack_message() uses SLACK_BOT_TOKEN.
SLACK_TOKEN = os.environ["SLACK_BOT_TOKEN"] # Slack Bot token (xoxb-...)
SLACK_BOT_TOKEN = os.environ["SLACK_BOT_TOKEN"]
SLACK_CHANNEL = os.environ["SLACK_CHANNEL"] # ID of the channel to post to (e.g., "C12345...")
# Number of results main() requests from arXiv per run.
MAX_RESULTS = 30
def get_papers(max_results: int = 10):
    """Return the most recent arXiv papers for the configured categories.

    Builds one ``cat:A OR cat:B ...`` query from ``PAPER_TYPE_LIST``, asks
    arXiv for the newest submissions, and drops anything published more than
    24 hours before the newest result.

    Args:
        max_results: Maximum number of results requested from arXiv.

    Returns:
        A list of dicts with the keys ``title``, ``summary``, ``pdf_url`` and
        ``published``; an empty list when the search returns nothing.
    """
    query_string = " OR ".join(f"cat:{code}" for code in PAPER_TYPE_LIST)
    arxiv_client = arxiv.Client()
    newest_first = arxiv.Search(
        query=query_string,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
    papers = list(arxiv_client.results(newest_first))
    if not papers:
        return papers

    # Results are sorted newest first, so the first entry anchors the window.
    oldest_allowed = papers[0].published - datetime.timedelta(hours=24)

    def as_record(entry):
        # Keep only the fields the rest of the pipeline reads.
        return {
            "title": entry.title,
            "summary": entry.summary,
            "pdf_url": entry.pdf_url,
            "published": entry.published,
        }

    return [as_record(entry) for entry in papers if entry.published >= oldest_allowed]
def generate_summary(abstract_text):
    """Summarize a paper abstract with the configured Gemini model.

    Args:
        abstract_text: The abstract to summarize; placed after the fixed
            instructions and a ``---`` separator in the prompt.

    Returns:
        The model's response text, stripped of leading/trailing whitespace.
    """
    prompt_parts = [
        "Please summarize the following paper abstract in a clear and easy-to-understand way for beginners, ",
        "in English and within 300 characters.\n",
        "Also include the significance and results of the paper.\n",
        "Output only the summary result.",
        "\n---\n\n",
        f"{abstract_text}",
    ]
    genai.configure(api_key=GEMINI_API_KEY)
    summarizer = genai.GenerativeModel(GEMINI_MODEL)
    answer = summarizer.generate_content("".join(prompt_parts))
    return answer.text.strip()
def send_slack_message(papers):
    """Post all paper summaries to Slack as one combined message.

    Each paper becomes a numbered section (title, summary, PDF link,
    publication date); sections are joined with a horizontal rule and sent
    to ``SLACK_CHANNEL`` in a single ``chat.postMessage`` call.

    Args:
        papers: Iterable of dicts with the keys ``title``, ``summary``,
            ``published`` and a PDF link under ``pdf_link`` (the key main()
            produces) or, as a fallback, ``pdf_url``.

    Returns:
        None. Slack API errors are logged rather than raised.
    """
    logger = logging.getLogger(__name__)

    all_messages = []
    for i, paper in enumerate(papers, 1):
        # main() stores the link under "pdf_link"; reading only "pdf_url"
        # raised KeyError for every paper it produced.
        link = paper["pdf_link"] if "pdf_link" in paper else paper["pdf_url"]
        all_messages.append(
            f"{i}: *{paper['title']}*\n\n"
            f"{paper['summary']}\n\n"
            f"PDF: {link}\n"
            f"Published: {paper['published']}"
        )

    if not all_messages:
        # Slack rejects a message with empty text, so skip the API call.
        logger.info("No papers to post; skipping Slack message.")
        return

    all_message = "\n\n────────────────────────\n\n".join(all_messages)
    client = WebClient(token=SLACK_BOT_TOKEN)
    try:
        result = client.chat_postMessage(
            channel=SLACK_CHANNEL,
            text=all_message,
        )
    except SlackApiError as e:
        logger.error("Error posting message: %s", e)
    else:
        logger.info(result)
def main():
    """Run one full pass: fetch papers, summarize them, post to Slack.

    Requests up to ``MAX_RESULTS`` papers, swaps each abstract for a
    generated summary, and passes the collected entries to
    send_slack_message().
    """
    output_papers = []
    recent_papers = get_papers(MAX_RESULTS)
    for entry in recent_papers:
        # Pull out every field first so a malformed entry fails before the
        # summarization request is made.
        heading = entry["title"]
        abstract_text = entry["summary"]
        pdf_location = entry["pdf_url"]
        published_at = entry["published"]

        digest = generate_summary(abstract_text)
        output_papers.append(
            dict(
                title=heading,
                summary=digest,
                pdf_link=pdf_location,
                published=published_at,
            )
        )
        # Pause between papers to space out the summarization requests.
        time.sleep(1)
    send_slack_message(output_papers)
def lambda_handler(event, context):
    """AWS Lambda entry point: run the pipeline and return a success response.

    Args:
        event: Invocation payload; not read by this handler.
        context: Lambda runtime context; not read by this handler.

    Returns:
        A dict with ``statusCode`` 200 and a fixed success message in ``body``.
    """
    main()
    response = dict(
        statusCode=200,
        body="Success in sending message to Slack!",
    )
    return response
Scheduling the Lambda Function with Amazon EventBridge
To run the process at regular intervals, use Amazon EventBridge. Choose scheduled execution and configure the cron expression. For example, to execute the function every day at 6:30 AM, use the following cron expression:
cron(30 6 * * ? *)
Note: The time is in UTC. If you need to use your local time, adjust the schedule accordingly.
Set the Lambda function as the target and specify the function you created.
With this setup, every day at the specified time, the papers will be summarized and the summary will be posted to Slack.
Top comments (0)