@unpublished{sailor2report,
title = {{Sailor2: Sailing in South-East Asia with Inclusive Multilingual LLM}},
author = {Dou, Longxu and Liu, Qian and Zhou, Fan and Chen, Changyu and Wang, Zili and Jin, Ziqi and Liu, Zichen and Zhu, Tongyao and Du, Cunxiao and Yang, Penghui and Wang, Haonan and Liu, Jiaheng and Zhao, Yongchi and Feng, Xiachong and Mao, Xin and Yeung, Man Tsung and Pipatanakul, Kunat and Koto, Fajri and Thu, Min Si and Kydl{\'\i}{\v{c}}ek, Hynek and Liu, Zeyi and Lin, Qunshu and Sripaisarnmongkol, Sittipong and Sae-Khow, Kridtaphad and Thongchim, Nirattisai and Konkaew, Taechawat and Borijindargoon, Narong and Dao, Anh and Maneegard, Matichon and Artkaew, Phakphum and Yong, Zheng-Xin and Nguyen, Quan and Phatthiyaphaibun, Wannaphong and Tran, Hoang H. and Zhang, Mike and Chen, Shiqi and Pang, Tianyu and Du, Chao and Wan, Xinyi and Lu, Wei and Lin, Min},
year = {2025},
eprint = {2502.12982},
archiveprefix = {arXiv},
primaryclass = {cs.CL},
url = {https://arxiv.org/abs/2502.12982},
note = {arXiv:2502.12982},
arxiv = {2502.12982}
}
Sailor2 is a family of cutting-edge multilingual language models for South-East Asian (SEA) languages, available in 1B, 8B, and 20B sizes to suit diverse applications. Building on Qwen2.5, Sailor2 undergoes continual pre-training on 500B tokens (400B SEA-specific and 100B replay tokens) to support 13 SEA languages while retaining proficiency in Chinese and English. The Sailor2-20B model achieves a 50-50 win rate against GPT-4o across SEA languages. We also deliver a comprehensive cookbook on how to develop multilingual models efficiently, covering five key aspects: data curation, pre-training, post-training, model customization, and evaluation. We hope that the Sailor2 models (Apache 2.0 license) will drive language development in the SEA region, and that the Sailor2 cookbook will inspire researchers to build more inclusive LLMs for other under-served languages.
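The replay idea in this abstract has a simple mechanical core: continual pre-training mixes new-domain tokens with data from the base model's original distribution so that earlier capabilities are not overwritten. The sketch below is my own simplification, not Sailor2's actual pipeline; the corpus contents are toy placeholders and the 80/20 sampling weights are an assumption derived from the 400B:100B token split.

```python
# Hedged sketch: interleave SEA-language documents with "replay" documents
# from the base model's original distribution at roughly 80/20, matching
# the 400B SEA : 100B replay token split described in the abstract.
# Corpus contents here are toy placeholders, not Sailor2 data.
import random

def mixed_stream(sea_docs, replay_docs, sea_weight=0.8, seed=0):
    """Yield documents, drawing ~80% from SEA data and ~20% from replay.

    Replaying original-distribution data is what lets the model keep its
    Chinese/English proficiency while it acquires the 13 SEA languages.
    """
    rng = random.Random(seed)
    while True:
        source = sea_docs if rng.random() < sea_weight else replay_docs
        try:
            yield next(source)
        except StopIteration:  # one corpus exhausted; stop the stream
            return

# Toy usage: build one mixed batch of 8 documents.
sea = iter([f"sea_doc_{i}" for i in range(300)])
replay = iter([f"replay_doc_{i}" for i in range(100)])
stream = mixed_stream(sea, replay)
print([next(stream) for _ in range(8)])
```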
@unpublished{chen2025assesshumanagentinteractionscase,
title = {{How can we assess human-agent interactions? Case studies in software agent design}},
author = {Chen, Valerie and Malhotra, Rohit and Wang, Xingyao and Michelini, Juan and Zhou, Xuhui and Soni, Aditya Bharat and Tran, Hoang H. and Smith, Calvin and Talwalkar, Ameet and Neubig, Graham},
year = {2025},
eprint = {2510.09801},
archiveprefix = {arXiv},
primaryclass = {cs.AI},
url = {https://arxiv.org/abs/2510.09801},
note = {arXiv:2510.09801},
arxiv = {2510.09801}
}
LLM-powered agents are both a promising new technology and a source of complexity: choices about models, tools, and prompting can all affect their usefulness. While numerous benchmarks measure agent accuracy across domains, they mostly assume full automation and so fail to represent the collaborative nature of real-world use. In this paper, we take two major steps towards the rigorous assessment of human-agent interactions. First, we propose PULSE, a framework for more efficient human-centric evaluation of agent designs, which comprises collecting user feedback, training an ML model to predict user satisfaction, and computing results by combining human satisfaction ratings with model-generated pseudo-labels. Second, we deploy the framework on a large-scale web platform built around the open-source software agent OpenHands, collecting in-the-wild usage data from over 15k users. We conduct case studies on how three agent design decisions (choice of LLM backbone, planning strategy, and memory mechanisms) impact developer satisfaction rates, yielding practical insights for software agent design. We also show how our framework leads to more robust conclusions about agent design, reducing confidence intervals by 40% compared to a standard A/B test. Finally, we find substantial discrepancies between in-the-wild results and benchmark performance (e.g., anti-correlated outcomes when comparing claude-sonnet-4 and gpt-5), underscoring the limitations of benchmark-driven evaluation. Our findings provide guidance for evaluating LLM agents with humans and identify opportunities for better agent designs.
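The pseudo-label step in PULSE is the part that tightens confidence intervals, and a small numerical sketch makes the mechanism concrete. The following is my difference-estimator reading of the abstract, not necessarily the paper's exact estimator: the satisfaction model's predictions cover every session, and the sessions that do have human ratings are used to debias them. All data below is simulated.

```python
# Hedged sketch of combining sparse human ratings with model-generated
# pseudo-labels (one reading of the PULSE abstract; the paper's actual
# estimator may differ). All numbers below are simulated.
import numpy as np

def pseudo_label_estimate(human_ratings, preds_labeled, preds_all):
    """Debiased mean satisfaction: pseudo-label mean over all sessions,
    corrected by the human-vs-model gap on the rated subset."""
    bias = np.mean(human_ratings) - np.mean(preds_labeled)
    return np.mean(preds_all) + bias

rng = np.random.default_rng(0)
n_sessions, n_rated = 10_000, 300            # hypothetical platform scale
true_sat = rng.binomial(1, 0.7, size=n_sessions)
# A noisy but informative satisfaction model:
preds = np.clip(true_sat + rng.normal(0.0, 0.25, n_sessions), 0.0, 1.0)

rated = rng.choice(n_sessions, size=n_rated, replace=False)
naive = true_sat[rated].mean()               # uses only 300 human ratings
combined = pseudo_label_estimate(true_sat[rated], preds[rated], preds)
print(f"human-only estimate: {naive:.3f}, combined estimate: {combined:.3f}")
```

Because the model's predictions track the human ratings, the residual (rating minus prediction) on the rated subset has lower variance than the raw ratings themselves, which is how combining the two sources can shrink confidence intervals relative to a ratings-only A/B test.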
@unpublished{wang2025openhandssoftwareagentsdk,
title = {{The OpenHands Software Agent SDK: A Composable and Extensible Foundation for Production Agents}},
author = {Wang, Xingyao and Rosenberg, Simon and Michelini, Juan and Smith, Calvin and Tran, Hoang and Nyst, Engel and Malhotra, Rohit and Zhou, Xuhui and Chen, Valerie and Brennan, Robert and Neubig, Graham},
year = {2025},
eprint = {2511.03690},
archiveprefix = {arXiv},
primaryclass = {cs.SE},
url = {https://arxiv.org/abs/2511.03690},
note = {arXiv:2511.03690},
arxiv = {2511.03690}
}
Agents are now widely used in software development, but building production-ready software engineering agents is a complex task. Deploying software agents effectively requires flexibility in implementation and experimentation, reliable and secure execution, and interfaces through which users can interact with agents. In this paper, we present the OpenHands Software Agent SDK, a toolkit for implementing software development agents that satisfy these desiderata. The toolkit is a complete architectural redesign of the agent components of the popular OpenHands framework for software development agents, which has 64k+ GitHub stars. To achieve flexibility, we design a simple interface for implementing agents that requires only a few lines of code in the default case, yet is easily extensible to more complex, full-featured agents with custom tools, memory management, and more. For security and reliability, it delivers seamless local-to-remote execution portability and integrated REST/WebSocket services. For interaction with human users, it can connect directly to a variety of interfaces, such as visual workspaces (VS Code, VNC, browser), command-line interfaces, and APIs. Compared with existing SDKs from OpenAI, Claude, and Google, OpenHands uniquely integrates native sandboxed execution, lifecycle control, model-agnostic multi-LLM routing, and built-in security analysis. Empirical results on the SWE-Bench Verified and GAIA benchmarks demonstrate strong performance. Together, these elements allow the OpenHands Software Agent SDK to provide a practical foundation for prototyping, unlocking new classes of custom applications, and reliably deploying agents at scale.
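To make the "few lines of code in the default case" claim concrete, here is a deliberately hypothetical sketch of that usage pattern. The class names, fields, and `run` method below are illustrative assumptions for this page, not the actual OpenHands Software Agent SDK API; the real interface lives in the SDK's documentation.

```python
# Hypothetical sketch only: illustrates the "simple default, extensible
# later" design the abstract describes. None of these names are claimed
# to match the real OpenHands Software Agent SDK.
from dataclasses import dataclass, field

@dataclass
class Agent:
    model: str                                  # LLM backbone to route to
    tools: list = field(default_factory=list)   # extension point: custom tools

    def run(self, task: str) -> str:
        # A production SDK would loop here: prompt the LLM, execute tool
        # calls in a local or remote sandbox, and stream events back.
        return f"[{self.model}] would now work on: {task!r}"

# Default case: two lines to get a working agent.
agent = Agent(model="claude-sonnet-4")
print(agent.run("Fix the failing test in tests/test_parser.py"))
```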
@unpublished{openhands,
title = {{OpenHands: An Open Platform for AI Software Developers as Generalist Agents}},
author = {Wang, Xingyao and Li, Boxuan and Song, Yufan and Xu, Frank F. and Tang, Xiangru and Zhuge, Mingchen and Pan, Jiayi and Song, Yueqi and Li, Bowen and Singh, Jaskirat and Tran, Hoang H. and Li, Fuqiang and Ma, Ren and Zheng, Mingzhang and Qian, Bill and Shao, Yanjun and Muennighoff, Niklas and Zhang, Yizhe and Hui, Binyuan and Lin, Junyang and Brennan, Robert and Peng, Hao and Ji, Heng and Neubig, Graham},
year = {2024},
eprint = {2407.16741},
archiveprefix = {arXiv},
primaryclass = {cs.SE},
url = {https://arxiv.org/abs/2407.16741},
note = {arXiv:2407.16741},
arxiv = {2407.16741}
}
Software is one of the most powerful tools that we humans have at our disposal; it allows a skilled programmer to interact with the world in complex and profound ways. At the same time, thanks to improvements in large language models (LLMs), there has also been rapid development of AI agents that interact with and effect change in their surrounding environments. In this paper, we introduce OpenHands (f.k.a. OpenDevin), a platform for the development of powerful and flexible AI agents that interact with the world in ways similar to those of a human developer: by writing code, interacting with a command line, and browsing the web. We describe how the platform allows for the implementation of new agents, safe interaction with sandboxed environments for code execution, coordination between multiple agents, and incorporation of evaluation benchmarks. Based on our currently incorporated benchmarks, we evaluate agents on over 15 challenging tasks, including software engineering (e.g., SWE-Bench) and web browsing (e.g., WebArena), among others. Released under the permissive MIT license, OpenHands is a community project spanning academia and industry, with more than 2.1K contributions from over 188 contributors.
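The "safe interaction with sandboxed environments for code execution" point is the load-bearing one when an agent is allowed to run arbitrary commands. As a rough illustration of that idea (not OpenHands' actual runtime, whose sandboxing is more elaborate; the container image and resource limits here are my assumptions), a command proposed by an agent can be executed in a throwaway, network-isolated Docker container instead of on the host:

```python
# Hedged sketch of sandboxed command execution: run an agent-proposed
# shell command in an ephemeral Docker container with networking disabled
# and memory capped. Requires a local Docker installation. This mirrors
# the safety goal described in the abstract, not OpenHands' real runtime.
import subprocess

def run_sandboxed(command: str, image: str = "python:3.12-slim",
                  timeout: int = 60) -> str:
    """Execute `command` inside a throwaway container and return its output."""
    result = subprocess.run(
        ["docker", "run", "--rm", "--network=none", "--memory=512m",
         image, "sh", "-c", command],
        capture_output=True, text=True, timeout=timeout,
    )
    return result.stdout if result.returncode == 0 else result.stderr

print(run_sandboxed("python -c 'print(2 + 2)'"))
```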
Hoang H. Tran
M.Sc. Student @ VinUniversity
Hanoi / Ho Chi Minh City
Vietnam
© 2026 Hoang H. Tran