index.html

<!doctype html>
<html lang="en">
    <head>
        <!-- Encoding and viewport are declared first so they are known before the rest of the head is parsed. -->
        <meta charset="utf-8">
        <meta name="viewport" content="width=device-width, initial-scale=1">

        <title>V-IRL: Grounding Virtual Intelligence in Real Life</title>
        <!-- The favicon file is a PNG, so the declared MIME type is image/png (not image/x-icon). -->
        <link rel="icon" type="image/png" href="static/img/icons/earth_icon.png">

        <!-- Open Graph metadata used for link previews. -->
        <meta property="og:url" content="https://virl-platform.github.io/" />
        <meta property="og:image" content="https://virl-platform.github.io/static/img/preview.png" />
        <meta property="og:title" content="V-IRL: Grounding Virtual Intelligence in Real Life" />
        <meta property="og:description" content="An open-source framework for embodied agent and open-world computer vision research. Develop practical agents and test foundation models grounded with real street view imagery from around the world." />

        <!-- Twitter card metadata; mirrors the Open Graph values above. -->
        <meta name="twitter:url" content="https://virl-platform.github.io/" />
        <meta name="twitter:card" content="summary_large_image" />
        <meta name="twitter:image" content="https://virl-platform.github.io/static/img/preview.png" />
        <meta name="twitter:title" content="V-IRL: Grounding Virtual Intelligence in Real Life" />
        <meta name="twitter:description" content="An open-source framework for embodied agent and open-world computer vision research. Develop practical agents and test foundation models grounded with real street view imagery from around the world." />

        <!-- Article template script; loaded synchronously, ahead of every other script. -->
        <script src="./static/js/distill_template.v2.js"></script>


        <!-- d3 and slid3r. RawGit has been retired, so slid3r is served from jsDelivr's GitHub endpoint instead. -->
        <script src="https://d3js.org/d3.v5.min.js"></script>
        <script src="https://d3js.org/d3-collection.v1.min.js"></script>
        <script src="https://cdn.jsdelivr.net/gh/nstrayer/slid3r@master/dist/slid3r.js"></script>

        <!-- Page-specific scripts. -->
        <script defer src="./static/js/hider.js"></script>
        <script src="./static/js/image_interact.js"></script>
        <script src="./static/js/switch_videos.js"></script>

        <link rel="stylesheet" href="./static/css/style.css">
        <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">

        <!-- KaTeX: auto-render runs over the whole body once its script has loaded. -->
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.10.2/dist/katex.min.css" integrity="sha384-yFRtMMDnQtDRO8rLpMIKrtPCD5jdktao2TV19YiZYWMDkUR5GQZR/NOVTdquEx1j" crossorigin="anonymous">
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.10.2/dist/katex.min.js" integrity="sha384-9Nhn55MVVN0/4OFx7EE5kpFBPsEMZxKTCnA+4fqDmg12eCTqGi6+BB2LjY8brQxJ" crossorigin="anonymous"></script>
        <script defer src="https://cdn.jsdelivr.net/npm/katex@0.10.2/dist/contrib/auto-render.min.js" integrity="sha384-kWPLUVMOks5AQFrykwIup5lo0m3iMkkHrD0uJ4H5cjeGihAutqP0yW0J6dpFiVkI" crossorigin="anonymous"
            onload="renderMathInElement(document.body);"></script>
        <script defer src="./static/js/fontawesome.all.min.js"></script>

        <!-- medium zoom https://github.com/francoischalifour/medium-zoom -->
        <script src="https://cdn.jsdelivr.net/npm/jquery@3.7.1/dist/jquery.min.js"></script>  <!-- jquery -->
        <script defer src="./static/js/medium-zoom.min.js"></script>
        <script defer src="./static/js/zoom.js"></script>
    </head>
    <body>
        <!-- Page header: title, one-paragraph pitch, resource buttons, and the teaser image. -->
        <div class="header-wrapper">
            <div class="header-container" id="header-container">
                <div class="header-content">
                    <h1 style="margin-top: 0px">V-<i>IRL</i>: Grounding Virtual Intelligence in Real Life</h1>
                    <p style="color: #FFF7D4">
                        An open-source framework for 
                        <em><strong style="color: #ffe099">embodied agent</strong></em>
                        and 
                        <em><strong style="color: #ffe099">open-world computer vision</strong></em>
                        research.
                        Develop practical agents and test foundation models in virtual real world cities across the globe, grounded with <em><strong>real</strong></em> geospatial data and street view imagery.
                    </p>
                    <!-- Resource links; each opens in a new tab. -->
                    <div class="button-container">
                        <!-- arXiv abstract page -->
                        <a href="https://arxiv.org/abs/2402.03310" class="button paper-link" target="_blank">
                            <span class="icon is-small">
                                <i class="ai ai-arxiv"></i>
                            </span>
                            arXiv
                        </a>
                        <!-- PDF served from this site's static directory -->
                        <a href="./static/V-IRL.pdf" class="button paper-link" target="_blank">
                            <span class="icon is-small">
                                <i class="fas fa-file-pdf"></i>
                            </span>
                            <span>pdf</span>
                        </a>
                        <!-- Source code repository -->
                        <a href="https://github.com/VIRL-Platform/VIRL" class="button" target="_blank">
                            <span class="icon is-small">
                                <i class="fab fa-github"></i>
                            </span>
                            <span>Code</span>
                        </a>
                    </div>
                </div>
                <div class="header-image">
                    <img src="static/img/teaser_img_v3.jpg" alt="Teaser Image" class="teaser-image">
                </div>
            </div>
        </div>
    <d-article>
        <!-- Byline: three columns (authors, affiliations, date). -->
        <div class="byline">
            <div class="byline-container">
                <div class="byline-column">
                    <h3>Authors</h3>
                    <!-- The superscript after each name keys into the Affiliations column. -->
                    <p><a href="https://jihanyang.github.io/" class="author-link" target="_blank">Jihan Yang</a> <sup>△</sup></p>
                    <p><a href="https://dingry.github.io/" class="author-link" target="_blank">Runyu Ding</a> <sup>△</sup></p>
                    <p><a href="https://ellisbrown.github.io/" class="author-link" target="_blank">Ellis Brown</a> <sup>▲</sup></p>
                    <p><a href="https://xjqi.github.io/" class="author-link" target="_blank">Xiaojuan Qi</a> <sup>△</sup></p>
                    <p><a href="https://www.sainingxie.com/" class="author-link" target="_blank">Saining Xie</a> <sup>▲</sup></p>
                </div>
                <div class="byline-column">
                    <h3>Affiliations</h3>
                    <!-- △ = University of Hong Kong -->
                    <p>
                        <sup>△</sup>
                        <a href="https://www.eee.hku.hk/" class="affiliation-link" target="_blank">University of Hong Kong</a>
                    </p>
                    <!-- ▲ = New York University -->
                    <p>
                        <sup>▲</sup>
                        <a href="https://cs.nyu.edu/home/index.html" class="affiliation-link" target="_blank">New York University</a>
                    </p>
                </div>
                <div class="byline-column">
                    <h3>Date</h3>
                    <p>
                        Feb 5<sup>th</sup>, 2024
                    </p>
                </div>
            </div>
        </div>

        <!--
            Section navigation bar. Every link is an in-page anchor whose only content is an
            image (or an emoji), so each image carries alt text naming the link destination;
            without it the links have no accessible name. Groups are separated by <hr> and
            coloured per capability via the *-color wrapper classes.
        -->
        <div class="nav-bar" id="nav-bar">
            <!-- Back-to-top link; the emoji is the only content, so the link is labelled explicitly. -->
            <a class="nav-link" href="#top" style="opacity: 0.7" aria-label="Back to top">
                <div style="margin: 8px 0px; text-align: center">
                    <span style="font-size: 30px;">&#128285;</span>
                </div>
            </a>
            <hr style="display: block; margin: auto;">
            <!-- Geolocation / mapping group -->
            <div class="geo-color">
                <a class="nav-link" href="#geo">
                    <img class="virl-tag" src="static/img/tags/geo.png" alt="Earthbound Agents">
                </a>
                <a class="nav-link" href="#peng"><img src="static/img/avatars/courier.png" title="Peng: visiting student" alt="Peng: visiting student">
                </a>
            </div>
            <hr style="display: block; margin: auto;">
            <!-- Language-model group -->
            <div class="llm-color">
                <a class="nav-link" href="#language">
                    <img class="virl-tag" src="static/img/tags/lm.png" alt="Language-Driven Agents">
                </a>
                <a class="nav-link" href="#aria"><img src="static/img/avatars/recommender.png" title="Aria: place recommender" alt="Aria: place recommender">
                </a>
                <a class="nav-link" href="#vivek"><img src="static/img/avatars/real_estate.png" title="Vivek: estate agent" alt="Vivek: estate agent">
                </a>
            </div>
            <hr style="display: block; margin: auto;">
            <!-- Computer-vision group -->
            <div class="cv-color">
                <a class="nav-link" href="#vision">
                    <img class="virl-tag" src="static/img/tags/cv.png" alt="Visually Grounded Agents">
                </a>
                <a class="nav-link" href="#rx-399"><img src="static/img/avatars/robot.png" title="RX-399: urban assistant robot" alt="RX-399: urban assistant robot">
                </a>
                <a class="nav-link" href="#imani"><img src="static/img/avatars/urban_planner.png" title="Imani: urban planner" alt="Imani: urban planner">
                </a>
                <a class="nav-link" href="#hiro"><img src="static/img/avatars/explorer.png" title="Hiro: explorer" alt="Hiro: explorer">
                </a>
            </div>
            <hr style="display: block; margin: auto;">
            <!-- Collaboration group -->
            <div class="col-color">
                <a class="nav-link" href="#collaboration">
                    <img class="virl-tag" src="static/img/tags/col.png" alt="Collaborative Agents">
                </a>
                <a class="nav-link" href="#ling">
                    <img src="static/img/avatars/tourist.png" title="Ling: tourist" alt="Ling: tourist">
                </a>
                <a class="nav-link" href="#diego">
                    <img src="static/img/avatars/concierge.png" title="Diego: expert concierge" alt="Diego: expert concierge">
                </a>
            </div>
            <hr style="display: block; margin: auto;">
            <div id="nav-bar-system">
                <a class="nav-link" href="#system"><img src="static/img/icons/system.png" title="System fundamentals" alt="System fundamentals"></a>
            </div>
            <hr style="display: block; margin: auto;">
            <div id="nav-bar-benchmark">
                <a class="nav-link" href="#benchmark"><img src="static/img/icons/benchmark.png" title="V-IRL Benchmark" alt="V-IRL Benchmark"></a>
            </div>
        </div>
        
        <!-- Embedded YouTube overview video with a short viewing hint underneath. -->
        <div class="l-page video-container" style="margin-left: 4%; margin-bottom: 20px">
            <iframe width="560" height="315" src="https://www.youtube.com/embed/F8OYtifxfe8?si=mgddGW5uih500O_m" title="YouTube video player" frameborder="0" allow="autoplay; encrypted-media; picture-in-picture" allowfullscreen></iframe>
            <figcaption style="text-align: center">(Best viewed in 4K)</figcaption>
        </div>

        <p class="text abstract">
            There's a massive gap between the text-centric digital environments of current AI agents and the sensory-rich world we humans inhabit.
            To develop agents that can operate flexibly and reliably in real-world settings, we must bridge this gap and embody agents in an environment that <em>necessitates</em> the nuanced perceptual understanding required in the real world.
            Naturally, this problem has long been studied in robotics, with agents embodied physically in the world; however, the physical constraints and cost of real hardware prohibit scaling up agents and testing them in diverse environments beyond the lab.
            <br><br>
            To address this challenge, we introduce <strong>V-<i>IRL</i></strong>,
            a <em>scalable</em> platform enabling agents to interact
            with a <em>virtual facsimile</em> of the real world.
            Leveraging mapping, geospatial, and street view imagery APIs (see <a href="#system">&sect;System Fundamentals</a>), V-<i>IRL</i>
            embeds agents in real cities across the Earth.
            To showcase the capabilities our platform enables, in <a href="#agent-exemplars">&sect;Agent Exemplars</a>, we use V-<i>IRL</i> to instantiate a series of agents
            that solve various practical tasks, grounded with its sensory-rich perceptual and descriptive data.
            <br><br>
            Our platform also functions as a vast testbed for measuring progress in 
            open-world computer vision and embodied AI with unprecedented scale and diversity&mdash;providing structured access to
            <em>hundreds of billions of images</em> spanning the entire globe.
            <d-footnote>
                Google Street View alone has >220 billion images as of May 2022, and there are numerous other sources of imagery and data that can be incorporated to enrich the environment.
                <a href="https://blog.google/products/maps/street-view-15-new-features/" target="_blank">https://blog.google/products/maps/street-view-15-new-features/</a>
            </d-footnote>
            In <a href="#benchmark">&sect;V-<i>IRL</i> Benchmark</a>, we use V-<i>IRL</i> to construct an initial benchmark of "open-world" vision models on a <em>truly open-world</em> distribution.
        </p>
        <!-- Looping teaser clip: muted + playsinline so it can autoplay inline; no controls are shown. -->
        <div class="l-page teaser-video">
            <video autoplay loop muted style="width:100%" preload="auto" playsinline>
                <source src="static/video/teaser_gif.mp4" type="video/mp4">
                <!-- Fallback text for browsers without <video> support. -->
                Your browser does not support the video tag.
            </video>
        </div>

        <hr>

        <div id="agent-exemplars" class="agent-block">
            <h1 class="text">V-<i>IRL</i> Agent Exemplars</h1>
            <p class="text">
                To demonstrate the versatility of the <strong>V-<i>IRL</i></strong> platform, we use it to instantiate several exemplar agents virtually in real cities around the globe and engage them in various practical tasks.
                For illustration, we give V-<i>IRL</i> agents character metadata, including an 8-bit avatar, a name, a short bio, and an intention they are trying to accomplish.
                For a deeper dive into V-<i>IRL</i>'s components and the capabilities they enable, see <a href="#system">&sect;System Fundamentals</a>.
            </p><p class="text">
                Each subsequent agent and their task is designed to reveal a new capability of the platform.
                We highlight the specific V-<i>IRL</i> capabilities being employed throughout using tags and correspondingly colored sections:
            </p>
            <!--
                Legend mapping each capability tag to its section. The tag images are given an
                empty alt because the text right after each one already names the capability;
                with no alt attribute at all, assistive technology may fall back to the file name.
            -->
            <ul class="text">
                <li><img class="virl-tag" src="static/img/tags/geo.png" alt=""> Action & Geolocation/Mapping capabilities: <a href="#geo" class="geo-color">&sect;Earthbound Agents</a></li>
                <li><img class="virl-tag" src="static/img/tags/lm.png" alt=""> Reasoning & Language Models: <a href="#language" class="llm-color">&sect;Language-Driven Agents</a></li>
                <li><img class="virl-tag" src="static/img/tags/cv.png" alt=""> Perception & Computer Vision: <a href="#vision" class="cv-color">&sect;Visually Grounded Agents</a></li>
                <li><img class="virl-tag" src="static/img/tags/col.png" alt=""> Agent-{Agent, Human} Collaboration: <a href="#collaboration" class="col-color">&sect;Collaborative Agents</a></li>
            </ul>
            <p class="click-hint" style="width: 85%;"><strong><img src="static/img/icons/click.gif" style="width: 35px">
                Hover over each avatar to see more info. Click to jump to its section.
            </strong></p>
            <!--
                Avatar gallery. Each avatar links to its agent's section, and hovering the link
                calls showTakeaway() with the id of the matching card in .exemplar-takeaways.
                showTakeaway is not defined in this file; presumably it comes from one of the
                scripts loaded in <head> (verify). The hover handler lives on the <a> for every
                avatar; Peng previously had it on the wrapper <div>, unlike the other seven.
            -->
            <div class="avatar-row figure">
                <div class="avatar">
                    <a href="#peng" onmouseover="showTakeaway('takeaway-peng')">
                        <img src="static/img/avatars/courier.png" alt="Route optimizer Peng">
                    </a>
                    <figcaption>Peng</figcaption>
                </div>
                <div class="avatar">
                    <a href="#aria" onmouseover="showTakeaway('takeaway-aria')">
                        <img src="static/img/avatars/recommender.png" alt="Place recommender">
                    </a>
                    <figcaption>Aria</figcaption>
                </div>
                <div class="avatar">
                    <a href="#vivek" onmouseover="showTakeaway('takeaway-vivek')">
                    <img src="static/img/avatars/real_estate.png" alt="Estate recommender">
                    </a>
                    <figcaption>Vivek</figcaption>
                </div>
                <div class="avatar">
                    <a href="#rx-399" onmouseover="showTakeaway('takeaway-rx-399')">
                    <img src="static/img/avatars/robot.png" alt="Robot RX-399">
                    </a>
                    <figcaption>RX-399</figcaption>
                </div>
                <div class="avatar">
                    <a href="#imani" onmouseover="showTakeaway('takeaway-imani')">
                    <img src="static/img/avatars/urban_planner.png" alt="Urban planner">
                    </a>
                    <figcaption>Imani</figcaption>
                </div>
                <div class="avatar">
                    <a href="#hiro" onmouseover="showTakeaway('takeaway-hiro')">
                    <img src="static/img/avatars/explorer.png" alt="Intentional explorer">
                    </a>
                    <figcaption>Hiro</figcaption>
                </div>
                <div class="avatar">
                    <a href="#ling" onmouseover="showTakeaway('takeaway-ling')">
                    <img src="static/img/avatars/tourist.png" alt="Tourist">
                    </a>
                    <figcaption>Ling</figcaption>
                </div>
                <div class="avatar">
                    <a href="#diego" onmouseover="showTakeaway('takeaway-diego')">
                    <img src="static/img/avatars/concierge.png" alt="Concierge">
                    </a>
                    <figcaption>Diego</figcaption>
                </div>
            </div>

            <!--
                One takeaway card per exemplar agent. Card ids follow the pattern
                "takeaway-<agent>", matching the ids the avatar row passes to showTakeaway().
                The tag images in each card header show which capabilities the agent uses, so
                each carries alt text naming that capability (they previously had none).
            -->
            <div class="exemplar-takeaways">
                <div class="takeaway-card" id="takeaway-peng">
                    <div class="takeaway-head">
                        <span>Peng: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> instantiates agents with real geospatial information, and enables useful tasks like route optimization.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-aria">
                    <div class="takeaway-head">
                        <span>Aria: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/lm.png" alt="Language models">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> exposes rich real-world information to agents that they can use for real-world tasks.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-vivek">
                    <div class="takeaway-head">
                        <span>Vivek: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/lm.png" alt="Language models">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        Grounded in geographic coordinates, V-<i>IRL</i> agents can leverage arbitrary real-world information via APIs.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-rx-399">
                    <div class="takeaway-head">
                        <span>RX-399: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/cv.png" alt="Computer vision">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> agents can use perceptual input to understand and interact with their environment.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-imani">
                    <div class="takeaway-head">
                        <span>Imani: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/cv.png" alt="Computer vision">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> enables realistic open-world applications requiring vast geospatial and first-person visual information.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-hiro">
                    <div class="takeaway-head">
                        <span>Hiro: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/lm.png" alt="Language models">
                            <img src="static/img/tags/cv.png" alt="Computer vision">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> agents can utilize visual detectors, VLMs and LLMs to iteratively perceive, decide, and interact in the environment.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-ling">
                    <div class="takeaway-head">
                        <span>Ling: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/lm.png" alt="Language models">
                            <img src="static/img/tags/cv.png" alt="Computer vision">
                            <img src="static/img/tags/col.png" alt="Collaboration">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> agents can collaborate to solve complex tasks that are beyond their individual expertise.
                    </p>
                </div>
                <div class="takeaway-card" id="takeaway-diego">
                    <div class="takeaway-head">
                        <span>Diego: Takeaway</span>
                        <div class="takeaway-tags">
                            <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                            <img src="static/img/tags/lm.png" alt="Language models">
                            <img src="static/img/tags/cv.png" alt="Computer vision">
                            <img src="static/img/tags/col.png" alt="Collaboration">
                        </div>
                    </div>
                    <p class="takeaway-content">
                        V-<i>IRL</i> agents can collaborate with users to solve complex tasks that require understanding the user's internal state.
                    </p>
                </div>
            </div>
        </div>

        <!-- "Earthbound Agents" section intro; target of the #geo in-page links. -->
        <div id="geo">
            <!-- The tag image is decorative next to the heading text, hence the empty alt. -->
            <h2 class="text"><img class="virl-tag" src="static/img/tags/geo.png" alt=""><br>Earthbound Agents</h2>
            <div class="l-screen grey-overlay"></div>
            <p class="text">
                Agents using the V-<i>IRL</i> platform inhabit virtual representations of real cities around the globe. At the core of this representation are <em>geographic coordinates</em> corresponding to points on the Earth's surface.
            </p>
            <div>
                <d-figure>
                    <!-- data-zoomable marks the image for the zoom scripts loaded in <head>. -->
                    <img src="static/img/Latitude_and_Longitude_of_the_Earth.svg" alt="Latitude and Longitude of the Earth" data-zoomable style="max-width: 100%;">
                    <figcaption style="text-align: center">Geographic Coordinates: Latitude and Longitude of the Earth
                        <d-footnote>Figure source: <a href="https://commons.wikimedia.org/wiki/File:Latitude_and_Longitude_of_the_Earth.svg" target="_blank">Wikimedia Commons</a></d-footnote>
                    </figcaption>
                </d-figure>
            </div>
            <p class="text">
                With these geographic coordinates as a link between digital media and the real world,
                V-<i>IRL</i> agents <em>ground</em> themselves in the world using APIs for maps, <em>real</em> street view imagery, information about nearby destinations, and much more.
            </p>
        </div>
        
        <div id="peng" class="agent-block geo">
            <div class="l-screen grey-overlay"></div>
            <!-- Agent heading; the avatar is decorative beside the name, so it gets an empty alt. -->
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/courier.png" alt=""> Peng: Visiting Student</h3>
            
            <video class="auto-video l-page" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/peng.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            
            <p class="text">
                Peng needs to visit several locations throughout NYC to get documents signed for registration as a visiting student...
                Leveraging Geolocation & Mapping capabilities, Peng saves 7 minutes by walking along the shortest path as opposed to visiting the waypoints in their original order.
            </p>
            
            <div>
                <d-figure>
                    <figure>
                        <img data-zoomable src="static/img/courier.jpg" alt="Route optimizer figure">
                        <figcaption style="text-align: center">Finding the shortest path for Peng to travel to five places.</figcaption>
                    </figure>
                </d-figure>
            </div>

            <!-- Peng's takeaway card; the tag image names the capability used, so it carries alt text. -->
            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png" alt="Geolocation and mapping">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> instantiates agents with real geospatial information, and enables useful tasks like route optimization.
                </p>
            </div>
        </div>
        
        <!-- "Language-Driven Agents" section intro; target of the #language in-page links. -->
        <div id="language">
            <!-- The tag image is decorative next to the heading text, hence the empty alt. -->
            <h2 class="text"><img class="virl-tag" src="static/img/tags/lm.png" alt=""><br>Language-Driven Agents</h2>
            <div class="l-screen grey-overlay"></div>
            <p class="text">
                To tackle more complex tasks, we follow the pattern of language-driven agents <d-cite key="xi2023rise"></d-cite>. LLMs enable agents to reason, plan, and use external tools & APIs.
            </p>
        </div>

        <!-- place recommender agent -->
        
        <div id="aria" class="agent-block language">
        <!-- Agent heading. The enclosing div already owns id="aria", so the heading must not repeat it (ids are document-unique). -->
        <h3 class="text" style="margin-top: 40px;"><img style="width: 35px" src="static/img/avatars/recommender.png" alt=""> Aria: Place Recommender</h3>
        <div class="l-screen grey-overlay"></div>
        
        <video class="auto-video" muted autoplay preload="auto" playsinline>
            <source src="static/video/story/aria.mp4" type="video/mp4">
            Your browser does not support the video tag.
        </video>
        <p class="text">
            Aria searches for possible restaurants nearby.
            She then synthesizes public reviews to make final recommendations via GPT-4. As Peng is new to the city and originally from Sichuan, she recommends the spicy Chinese joint <em>Chow House 粤德轩</em> to give him a taste of home.
        </p>
        
        <!-- TODO: optimize mobile view -->
        <div class="interactive-image">
            <div id="image-text-container">
                <!-- Prompt above the place buttons. The x / y / text-anchor attributes only apply to SVG text and have no effect on a <p>, so they are dropped. -->
                <p style="font-weight: 700; font-size: 15px; font-family: sans-serif;">Click to check different candidate places:</p>
                <div class="button-bar">
                    <!-- Buttons to interact with the image and text -->
                    <button onclick="changeContent('Place1')">Chow House</button>
                    <button onclick="changeContent('Place2')">Kwa Food Fried Skewers</button>
                    <button onclick="changeContent('Place3')">Tartinery Cafe</button>
                    <button onclick="changeContent('Place4')">Sushi Zo</button>
                    <button onclick="changeContent('Place5')">Dos Toros Taqueria</button>
                </div>
                <div class="image-review-container" style="margin-bottom: 0px;">
                    <img data-zoomable style="width: 55%;" src="static/img/place_recommend/place1.jpg" alt="place illustration">
                    <div class="text-box">
                        <p style="font-size: 14px; margin-bottom: 0px;"><strong>Example Place Review:</strong></p>
                        <blockquote class="place-review">Well done, Chow House. Authentic cuisine, expertly prepared.  Twice Cooked pork, done the right way. You can see the leeks from the photo with none of that Americanized garbage many places dump into this otherwise elegant dish... Big shrimp.  Big flavor.  Peanuts.  Initially, I was scared of the deep fried dried red pepper, but they turned out really tasty and crunchy, not as spicy as I had originally feared. Pork fried rice, expertly cooked.  All in all, if you live downtown and like authentic Sichuan food, this is your place.  Bravo! (rating: 5)</blockquote>
                        <blockquote class="place-review">Amazing Asian grill! Great taste, big variety, good prices... They have plenty of options vegan and non-vegan. The skewers are super affordable, fun to eat and taste great. Prices range from 50cents - 4$ per skewer,  depending in what skewers, with most being around 2$. They deep fry the skewers right there for you and add salt and other spices (let them know if you don't like your food too salty). Highly recommended:).(rating: 5)</blockquote>
                        <blockquote class="place-review">I was so impressed with my brunch. I tried the French toast, my boyfriend  got the Eggs Benedict and our friend got the burger. The quality of the food is good and it takes a short amount of time to receive it. The staff was sweet and helpful. I will definitely come back. (rating: 5)</blockquote>
                        <blockquote class="place-review">You must have to love sushi if you plan on dining here. It’s $299/person and omakase only... The sushi is flown in daily from Japan. It is definitely a dining experience for taste, texture and art of the making and presentation of each sushi. With a few glasses of wine and bottled water as well as tip was about $850.  It’s quite expensive but I feel that the experience and quality of the food is worth it. It’s perfect for a special occasion. The sushi was the freshest sushi I’ve ever ate in my life. (rating: 5)</blockquote>
                        <blockquote class="place-review">Very sad burrito for the price point. Very small small for the price point as you can see it is no bigger than my friends arm. There were things missing from the burrito such as pico and guac. Each bite I took I regretted. As a vegan I would say it’s better for your wallet and your stomach to go to chipotle. (rating: 2)</blockquote>
                    </div>
                </div>
                <p style="text-align: left; font-size: 15px; margin-bottom: 0px"><strong>Agent Consideration:</strong></p>
                <blockquote style="font-size:14px">Chow House is a highly recommended Sichuan restaurant, which aligns with Peng's background as he grew up in Sichuan. The restaurant offers authentic Sichuan food, which Peng might be familiar with and enjoy. The restaurant also has good seating, decoration, and friendly service, which would make for a pleasant dining experience. However, some dishes received mixed reviews, which is why the rating is not a perfect 10.</blockquote>
            </div>
        </div>

        <div class="takeaway-card">
            <div class="takeaway-head">
                <span>Takeaway</span>
                <div class="takeaway-tags">
                    <img src="static/img/tags/geo.png">
                    <img src="static/img/tags/lm.png">
                </div>
            </div>
            <p class="takeaway-content">
                V-<i>IRL</i> exposes rich real-world information to agents that they can use for real-world tasks.
            </p>
        </div>
        </div>
        
        <!-- Real estate agent -->
        <div id="vivek" class="agent-block language">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/real_estate.png"> Vivek: Estate Agent</h3>
            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/vivek.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            <p class="text">
                Vivek uses real estate APIs to find potential apartments in Peng's desired regions and price range.
                For each candidate, he researches its proximity to the places Peng cares about. Synthesizing these factors, Vivek provides a holistic rating and accompanying reasoning using GPT-4.
                His top recommendation is a cost-effective 1 bedroom apartment for $1986/mo, which is close to a supermarket, 2 bus stations, and a gym.
            </p>

            <div>
                <d-figure>
                    <figure>
                        <img data-zoomable src="static/img/agent_estate.jpg" alt="Estate recommender">
                        <figcaption style="text-align: center">Part of candidate estates.</figcaption>
                    </figure>
                </d-figure>
            </div>

            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/lm.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    Grounded in geographic coordinates, V-<i>IRL</i> agents can leverage arbitrary real-world information via APIs.
                </p>
            </div>
        </div>

        <!-- Visual agents -->

        <div id="vision">
            <h2 class="text"><img class="virl-tag" src="static/img/tags/cv.png"><br>Visually Grounded Agents</h2>
            <div class="l-screen grey-overlay"></div>
            <p class="text">
                Although language-driven agents can address some real-world tasks using external tools, their reliance on solely text-based information limits their applicability to tasks where <em>visual grounding</em> is required.
                In contrast, <em>real sensory input</em> is integral to many daily human activities&mdash;allowing a deep connection to and understanding of the 
                real world around us.
                Agents can leverage street view imagery through the V-<i>IRL</i> platform to <em>visually ground</em> themselves in the real world&mdash;opening up a wide range of <em>perception-driven tasks</em>.
            </p>
        </div>
        
        <!-- RX-399 -->
        <div id="rx-399" class="agent-block vision">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/robot.png"> RX-399: Urban Assistance Robot</h3>

            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/rx399.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>

            <p class="text">
                RX-399 navigates along pre-defined city routes, tagging all trash bins using its open-world detector and geolocation module as depicted in the following figure and videos. 
            </p>

            <div id="slider-img-rx399" class="slider-img-container">
                <div class="my-slides">
                <img data-zoomable src="static/img/rx-399/rx-399_clean_ny.jpg" style="width:100%">
                </div>
            
                <div class="my-slides">
                <img data-zoomable src="static/img/rx-399/rx-399_clean_hk.jpg" style="width:100%">
                </div>
                
                <a class="prev" onclick="plusSlides('slider-img-rx399', -1)">❮</a>
                <a class="next" onclick="plusSlides('slider-img-rx399', 1)">❯</a>
                
                <figcaption id="caption" style="margin-bottom: 10px; text-align: center"></figcaption>
            
                <div class="row">
                    <div class="column">
                        <img class="demo cursor" src="static/img/rx-399/rx-399_clean_ny.jpg" style="width:100%" onclick="currentSlide('slider-img-rx399', 1)" alt="Portions of RX-399's system records in New York City.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/rx-399/rx-399_clean_hk.jpg" style="width:100%" onclick="currentSlide('slider-img-rx399', 2)" alt="Portions of RX-399's system records in Hong Kong.">
                    </div>
                </div>
            </div>

            <div class="l-body">
                <div id="RX399video1Container" class="video-container">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/rx-399_ny.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
                <div id="RX399video2Container" class="video-container" style="display:none;">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/rx-399_hk.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
            
                <!-- Preview Images in a Flex Container -->
                <div class="preview-container">
                    <text x="165" y="-12.5" text-anchor="middle" style="font-weight: 700; font-size: 15px; font-family: sans-serif;">Switch recording videos between NYC and HK:</text>
                    <img id="RX399video1Preview" class="preview" src="static/img/previews/video_rx-399_preview.jpg" alt="Preview image of RX-399 NYC" onclick="switchVideo('RX399', 'video1Container', 'video1Preview')">
                    <img id="RX399video2Preview" class="preview" src="static/img/previews/video_rx-399_hk_preview.jpg" alt="Preview image of RX-399 HK" onclick="switchVideo('RX399','video2Container', 'video2Preview')">
                </div>
            </div>
        
            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/cv.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> agents can use perceptual input to understand and interact with their environment.
                </p>
            </div>
        </div>
        
        <!-- Urban Planner -->
        <div id="imani" class="agent-block vision">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/urban_planner.png"> Imani: Urban Planner</h3>
            
            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/imani.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            
            <p class="text">
                Imani sets routes spanning Central Park and objects of interest for RX-399, who traverses the routes and records all detected instances. 
                After RX-399 finishes its route, Imani analyzes the data collected by RX-399 at different levels of detail.
            </p>
            <div class="img-magnifier-container">
                <img data-zoomable id="urban_planner_img" style="width: 100%" src="static/img/urban_planner.jpg" alt="Urban Planner agent visualization">
                <figcaption>Imani's visualization of trash bins, fire hydrants, and park benches in NYC's Central Park using data collected by RX-399. The coarsest level shows general distributions of trash bins, hydrants, and benches in the park.
                Imani can also zoom in to specific regions, where lighter colors represent positions with more unique instances identified.</figcaption>
            </div>

            <aside class="counting-table">
                <figure style="width: 300px">
                    <table style="margin-bottom: 5px">
                        <tr>
                            <th style="font-size: 13px;">Category</th>
                            <th style="font-size: 13px;">Trash bin</th>
                            <th style="font-size: 13px;">Hydrant</th>
                            <th style="font-size: 13px;">Bench*</th>
                        </tr>
                        <tr>
                            <td style="font-size: 13px;">Count</td>
                            <td style="font-size: 13px;">1059</td>
                            <td style="font-size: 13px;">727</td>
                            <td style="font-size: 13px;">1015</td>
                        </tr>
                    </table>
                    <figcaption class="table-caption">
                        Table 1: RX-399's counting report. *Note: contiguous benches counted as one instance.
                    </figcaption>
                </figure>
            </aside>

            <div class="l-body">
                <div id="UrbanPlannervideo1Container" class="video-container">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/urban_planner.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
                <div id="UrbanPlannervideo2Container" class="video-container" style="display:none;">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/urban_planner_play.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
            
                <!-- Preview Images in a Flex Container -->
                <div class="preview-container">
                    <text x="165" y="-12.5" text-anchor="middle" style="font-weight: 700; font-size: 15px; font-family: sans-serif;">Switch videos between data collecting and heatmap distribution:</text>
                    <img id="UrbanPlannervideo1Preview" class="preview" src="static/img/previews/video_urban_plan_collect_preview.jpg" alt="Preview image of urban planner exploration" onclick="switchVideo('UrbanPlanner', 'video1Container', 'video1Preview')">
                    <img id="UrbanPlannervideo2Preview" class="preview" src="static/img/previews/video_urban_plan_play_preview.jpg" alt="Preview image of urban planner checking" onclick="switchVideo('UrbanPlanner','video2Container', 'video2Preview')">
                </div>
            </div>

            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/cv.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> enables realistic open-world applications requiring vast geospatial and first-person visual information.
                </p>
            </div>
        </div>

        <!-- Intentional explorer -->
        <div id="hiro" class="agent-block vision">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/explorer.png"> Hiro: Seasoned Traveler (Intentional Explorer)</h3>

            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/hiro.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            <p class="text">
                Driven by his intention, Hiro uses open-world detection to find a restaurant, VQA to select proper roads, and place reviews together with an LLM to decide whether a place is suitable for his purpose.
            </p>
            
            <div id="slider-img-explorer" class="slider-img-container">
                <div class="my-slides">
                <div class="numbertext">Milestone 1 / 5</div>
                <img src="static/img/intentional_explorer/intentional_explorer_split_1.jpg" style="width:100%">
                </div>
            
                <div class="my-slides">
                <div class="numbertext">Milestone 2 / 5</div>
                <img src="static/img/intentional_explorer/intentional_explorer_split_2.jpg" style="width:100%">
                </div>

                <div class="my-slides">
                    <img src="static/img/intentional_explorer/intentional_explorer_split_3.jpg" style="width:100%">
                    <div class="numbertext">Milestone 3 / 5</div>
                </div>

                <div class="my-slides">
                    <img src="static/img/intentional_explorer/intentional_explorer_split_4.jpg" style="width:100%">
                    <div class="numbertext">Milestone 4 / 5</div>
                </div>

                <div class="my-slides">
                    <img src="static/img/intentional_explorer/intentional_explorer_split_5.jpg" style="width:100%">
                    <div class="numbertext">Milestone 5 / 5</div>
                </div>
                
                <a class="prev" onclick="plusSlides('slider-img-explorer', -1, 'explorer-aside')">❮</a>
                <a class="next" onclick="plusSlides('slider-img-explorer', 1, 'explorer-aside')">❯</a>
                
                <figcaption id="caption" style="margin-bottom: 10px; text-align: center">Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.</figcaption>
            
                <div class="row">
                    <div class="column">
                        <img class="demo cursor" src="static/img/intentional_explorer/intentional_explorer_split_1.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-explorer', 1, 'explorer-aside')" alt="Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/intentional_explorer/intentional_explorer_split_2.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-explorer', 2, 'explorer-aside')" alt="Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/intentional_explorer/intentional_explorer_split_3.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-explorer', 3, 'explorer-aside')" alt="Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/intentional_explorer/intentional_explorer_split_4.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-explorer', 4, 'explorer-aside')" alt="Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/intentional_explorer/intentional_explorer_split_5.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-explorer', 5, 'explorer-aside')" alt="Visualization for Hiro's lunch exploration in HK. Concrete procedure is depicted in the following video.">
                    </div>
                </div>
            </div>
            <aside class="explorer-aside">
                Starting at the user-defined location <img src="static/img/icons/start_icon.jpg" class="inline-tag" draggable="false" style="height: 16px;">, Hiro walks down the street to find a place that can fulfil his intention: "<i>Hiro is hungry and looking for a place where he can explore great local food. He cannot handle spicy food.</i>"</aside>
            <aside class="explorer-aside">
                <p>When he meets the first intersection, thanks to the interactive and sensor-rich environment, he adjusts his pose to fetch real street views for each possible path at the crossroads. He then uses these with <em>VQA</em> to <em>decide</em> to turn left:</p>
                <blockquote style="font-size: 12px">
                    <img src="static/img/icons/star.png" class="inline-tag" draggable="false" style="height: 16px;"> Road 1 has this homey, residential feel to it, which usually means cozy, family-run spots serving up the real-deal local food. It's got that quiet, laid-back dining scene that lets you really soak in the experience, take your time, and eat like one of the locals.
                </blockquote>
            </aside>
            <aside class="explorer-aside">
                <p>Then, after exploring for a block, he encounters the second intersection where he <em>looks around</em> and <em>decides</em> to turn right:</p>
                <blockquote style="font-size: 12px">
                    <img src="static/img/icons/star.png" class="inline-tag" draggable="false" style="height: 16px;"> I'm leaning towards taking Road 2. It looks promising with all those signs pointing to eateries and local food joints. That's got to mean there's a good selection of local dishes to try out. And with more places to choose from, I bet I'll have a better shot at finding something that isn't too spicy. That's a big deal for me. Road 1 just doesn't seem to cut it; it's more of a residential vibe and doesn't really shout 'food' like Road 2 does.
                </blockquote>
            </aside>
            <aside class="explorer-aside">
                <p>After a few steps, Hiro finds <i>"A One Chinese Noodles 阿一豬扒酸辣米線"</i>
                    using his <em>open-world detector</em>, and looks up 
                    its information and reviews using our <i>real-world environment</i> which <i>connects street views to places</i>. Hiro <i>decides</i> to pass on it because:</p>
                <blockquote style="font-size: 12px">
                    <img src="static/img/icons/star.png" class="inline-tag" draggable="false" style="height: 16px;"> Hmm, spicy food is a no-go for me, and this place seems to be all about pork chop noodles. That might be tricky with my dietary needs. I should probably keep looking for something that fits what I can eat.
                </blockquote>
            </aside>
            <aside class="explorer-aside">
                <p>
                    Finally, at the end of this street block <img src="static/img/icons/end_icon.jpg" class="inline-tag" draggable="false" style="height: 16px;">, Hiro discovers another lunch spot called <i>Xintianfa 新天發</i>. He decides to dine there after <em>reading</em> <em>online reviews</em> praising its authentic cuisine and diverse menu:
                </p>
                <blockquote>
                    Even though opinions vary, Xintianfa presents an array of local cuisine that beckons to my desire for authentic culinary experiences. My seasoned traveler's spirit thrives on novelty and the thrill of discovery, so a restaurant with a diverse menu naturally draws me in. Additionally, the absence of any emphasis on spicy fare is a relief, given my inability to tolerate heat in my meals.
                </blockquote>
            </aside>


            <p class="video-container">
                <video class="video-music" controls preload="metadata" playsinline>
                    <source src="static/video/intentional_explorer.mp4" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
            </p>

            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/lm.png">
                        <img src="static/img/tags/cv.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> agents can utilize visual detectors, VLMs and LLMs to iteratively perceive, decide, and interact in the environment.
                </p>
            </div>
        </div>
        
        <!-- Collaborative agents -->

        <div id="collaboration">
            <h2 class="text">Collaborative Agents<br><img class="virl-tag" src="static/img/tags/col.png"></h2>
            <div class="l-screen grey-overlay"></div>
            <p class="text">
                Humans often work together to solve complex real-world tasks. This collaboration promotes efficiency and effectiveness by decomposing a complex task into simpler sub-tasks, allowing each to be handled by an expert in its domain. 
            </p>
        </div>
        <div id="ling" class="agent-block collaboration">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/tourist.png"> Ling: Tourist</h3>

            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/ling.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            <p class="text">
                After obtaining route descriptions from Locals, Ling starts her journey. Grounded in our embodied platform, Ling can adjust her pose and identify visual landmarks along the streets using open-world recognition and her map. Recognizing these landmarks helps GPT-4 to make correct decisions about where to turn, move forward, and stop. Concrete examples are shown in the following figure and videos.
            </p>

            <div id="slider-img-tourist" class="slider-img-container">
                <div class="my-slides">
                <img data-zoomable src="static/img/tourist/tourist_nyc_1.jpg">
                </div>
            
                <div class="my-slides">
                <img data-zoomable src="static/img/tourist/tourist_nyc_2.jpg">
                </div>

                <div class="my-slides">
                    <img data-zoomable src="static/img/tourist/tourist_sf.jpg">
                </div>

                <div class="my-slides">
                    <img data-zoomable src="static/img/tourist/tourist_hk.jpg">
                </div>
                
                <a class="prev" onclick="plusSlides('slider-img-tourist', -1, 'tourist-aside')">❮</a>
                <a class="next" onclick="plusSlides('slider-img-tourist', 1, 'tourist-aside')">❯</a>
                
                <figcaption id="caption" style="margin-bottom: 10px; text-align: center"></figcaption>
            
                <div class="row">
                    <div class="column">
                        <img class="demo cursor" src="static/img/tourist/tourist_nyc_1.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-tourist', 1, 'tourist-aside')" alt="Ling and Local collaboration examples in New York City.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/tourist/tourist_nyc_2.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-tourist', 2, 'tourist-aside')" alt="Another Ling and Local collaboration examples in New York City.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/tourist/tourist_sf.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-tourist', 3, 'tourist-aside')" alt="Ling and Local collaboration examples in San Francisco.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/tourist/tourist_hk.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-tourist', 4, 'tourist-aside')" alt="Ling and Local collaboration examples in Hong Kong.">
                    </div>
                </div>
            </div>
            <aside class="tourist-aside">
                Ling successfully finds a nearby gift store by following the route description from a Local agent.
            </aside>
            <aside class="tourist-aside">
                Ling successfully finds a good burger spot by following the route description from a Local agent.
            </aside>
            <aside class="tourist-aside">
                Ling passes by the destination because only the wall of the Apple store is visible from her viewpoint. Fortunately, she can ask another Local agent nearby to start another round of navigation, which eventually leads her to the destination. Ling's first and second attempts are shown in red and green trajectories, respectively.
            </aside>
            <aside class="tourist-aside">
                Ling mistakes another restaurant for her destination on her first attempt. She can then ask another Local agent nearby to start another round of navigation, which eventually leads her to the destination. Ling's first and second attempts are shown in red and green trajectories, respectively.
            </aside>

            <div class="l-body">
                <div id="Touristvideo1Container" class="video-container">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/tourist_sf.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
                <div id="Touristvideo2Container" class="video-container" style="display:none;">
                    <video class="video-music" controls preload="metadata" playsinline>
                        <source src="static/video/tourist_hk.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                </div>
            
                <!-- Preview Images in a Flex Container -->
                <div class="preview-container">
                    <text x="165" y="-12.5" text-anchor="middle" style="font-weight: 700; font-size: 15px; font-family: sans-serif;">Switch videos between SF and HK journeys:</text>
                    <img id="Touristvideo1Preview" class="preview" src="static/img/previews/video_toursit_sf_preview.jpg" alt="Preview image of tourist-local SF" onclick="switchVideo('Tourist', 'video1Container', 'video1Preview')">
                    <img id="Touristvideo2Preview" class="preview" src="static/img/previews/video_tourist_hk_preview.jpg" alt="Preview image of tourist-local HK" onclick="switchVideo('Tourist','video2Container', 'video2Preview')">
                </div>
            </div>

            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/lm.png">
                        <img src="static/img/tags/cv.png">
                        <img src="static/img/tags/col.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> agents can collaborate to solve complex tasks that are beyond their individual expertise.
                </p>
            </div>
        </div>

        <div id="diego" class="agent-block collaboration">
            <div class="l-screen grey-overlay"></div>
            <h3 class="text"><img style="width: 35px" src="static/img/avatars/concierge.png"> Diego: Expert Concierge</h3>
            <video class="auto-video" muted autoplay preload="auto" playsinline>
                <source src="static/video/story/diego.mp4" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            <p class="text">
            As depicted in the following figure, Diego's itinerary is tailored to your needs. Diego not only considers your physical and mental interoception status and your budget for each activity, but also anticipates how your status and costs change as you follow each event.
            He is able to take into account <em>real</em> travel times from the V-<i>IRL</i> platform and select suitable dining options by collaborating with another 
            restaurant recommendation agent.
            </p>
            <d-figure class="l-page">
                <figure>
                    <video id="diego-plan-video" playsinline autoplay loop muted>
                        <source src="static/video/interactive_concierge_gif.mp4" type="video/mp4">
                        Your browser does not support the video tag.
                    </video>
                    <figcaption id="diego-plan-video-cap"><em>The Perfect Day Itinerary</em>: Crafted by Diego, our iterative concierge agent, this schedule is meticulously tailored, accounting for your mental and physical well-being and budget variations as your day unfolds.</figcaption>
                </figure>
            </d-figure>
            <p class="text">
                You can intervene in Diego's planning process by adjusting your interoception status or providing verbal feedback to Diego.
                In response, Diego promptly revises his original plan to accommodate your demands and re-estimates your state changes after the revision (see the following figures).
            </p>

            <div>
                <img src="static/img/diego_intervent/diego_origin.jpg" class="diego-revise-img">
            </div>
            <div id="slider-img-diego-revise-state" class="slider-img-container">
                <div class="my-slides">
                    <img src="static/img/diego_intervent/adjust_state_1.jpg" class="diego-revise-img">
                </div>
            
                <div class="my-slides">
                    <img src="static/img/diego_intervent/adjust_state_2.jpg" class="diego-revise-img">
                </div>

                <div class="my-slides">
                    <img src="static/img/diego_intervent/adjust_state_3.jpg" class="diego-revise-img">
                </div>
                
                <a class="prev" id="prev-diego-revise-1" onclick="plusSlides('slider-img-diego-revise-state', -1)">❮</a>
                <a class="next" id="next-diego-revise-1" onclick="plusSlides('slider-img-diego-revise-state', 1)">❯</a>
            </div>

            <div id="slider-img-diego-revise-verbal" class="slider-img-container">
                <div class="my-slides">
                    <img src="static/img/diego_intervent/verbal_1.jpg" class="diego-revise-img">
                </div>
            
                <div class="my-slides">
                    <img src="static/img/diego_intervent/verbal_2.jpg" class="diego-revise-img">
                </div>

                <div class="my-slides">
                    <img src="static/img/diego_intervent/verbal_3.jpg" class="diego-revise-img">
                </div>
                
                <a class="prev" id="prev-diego-revise-2" onclick="plusSlides('slider-img-diego-revise-verbal', -1)">❮</a>
                <a class="next" id="next-diego-revise-2" onclick="plusSlides('slider-img-diego-revise-verbal', 1)">❯</a>
            </div>
            <figcaption style="width: 100%; text-align: center; margin-bottom: 20px;">Diego adapts the original plan to suit the user's intervention.</figcaption>
            
            <p class="text">
                Behind Diego's proficiency in developing itineraries is his iterative planning pipeline.
                The process begins with Diego creating an initial draft plan for the first activity using <i>GPT-4</i>, taking into account the user's biography, requirements, and previous activities in working memory. This draft is then meticulously refined by <i>hierarchical coordination</i> (real geospatial/place information), <i>interoceptive estimation</i> (activity cost and influence on human states) and <i>supervisor</i> (human interoception, budget and potential intervention).
            </p>
            <div>
                <video playsinline autoplay loop muted style="width: 100%">
                    <source src="static/video/interactive_conciege_pipeline_gif.mp4" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <figcaption style="width: 100%; text-align: center">Pipeline overview of interactive concierge agent Diego.</figcaption>
            </div>

            <p class="text">
                Additionally, grounded on the tightly related street views and map in V-<i>IRL</i>, Diego travels to the places in his itinerary to scout for potential scenic viewpoints for you, as shown in the following figure.
                He uses VQA to assess each captured view, attaching highly rated positions to your itinerary.
            </p>

            <div id="slider-img-photographer" class="slider-img-container">
                <div class="my-slides">
                    <img src="static/img/photographer/photographer_1.jpg">
                    <div class="overlay"><i>Geo Location: [40.8649162, -73.9311561]</i></div>
                </div>
            
                <div class="my-slides">
                    <img src="static/img/photographer/photographer_2.jpg">
                    <div class="overlay"><i>Geo Location: [40.8647205, -73.9325163]</i></div>
                </div>

                <div class="my-slides">
                    <img src="static/img/photographer/photographer_3.jpg">
                    <div class="overlay"><i>Geo Location: [40.8653388, -73.9322499]</i></div>
                </div>

                <div class="my-slides">
                    <img src="static/img/photographer/photographer_4.jpg">
                    <div class="overlay"><i>Geo Location: [40.8609142,-73.9324818]</i></div>
                </div>

                <div class="my-slides">
                    <img src="static/img/photographer/photographer_5.jpg">
                    <div class="overlay"><i>Geo Location: [40.8642401,-73.9325958]</i></div>
                </div>
                
                <a class="prev" onclick="plusSlides('slider-img-photographer', -1)">❮</a>
                <a class="next" onclick="plusSlides('slider-img-photographer', 1)">❯</a>
                
                <figcaption id="caption" style="margin-bottom: 10px; text-align: center">Diego rates scenery and records attractive locations in your itinerary.</figcaption>

                <div class="row">
                    <div class="column">
                        <img class="demo cursor" src="static/img/photographer/photographer_1.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-photographer', 1)" alt="Diego rates scenery and records attractive locations in your itinerary.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/photographer/photographer_2.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-photographer', 2)" alt="Diego rates scenery and records attractive locations in your itinerary.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/photographer/photographer_3.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-photographer', 3)" alt="Diego rates scenery and records attractive locations in your itinerary.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/photographer/photographer_4.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-photographer', 4)" alt="Diego rates scenery and records attractive locations in your itinerary.">
                    </div>
                    <div class="column">
                        <img class="demo cursor" src="static/img/photographer/photographer_5.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-photographer', 5)" alt="Diego rates scenery and records attractive locations in your itinerary.">
                    </div>
                </div>
            </div>
            <br>
            <div class="takeaway-card">
                <div class="takeaway-head">
                    <span>Takeaway</span>
                    <div class="takeaway-tags">
                        <img src="static/img/tags/geo.png">
                        <img src="static/img/tags/lm.png">
                        <img src="static/img/tags/cv.png">
                        <img src="static/img/tags/col.png">
                    </div>
                </div>
                <p class="takeaway-content">
                    V-<i>IRL</i> agents can collaborate with users to solve complex tasks that require understanding the user's internal state.
                </p>
            </div>
        </div>


        </div>


        <!-- Part 2: System fundamentals -->
        <div id="system" style="position: relative">
        <h2 class="text"><img src="static/img/icons/system.png" width="50px"> System Fundamentals</h2>
        <p class="text">
            Here we describe <strong>V-<i>IRL</i></strong>'s hierarchical architecture that transforms real cities around the world into a vast virtual playground in which agents can be constructed to solve practical tasks.
            The <i><u>platform</u></i> lies at the foundation&mdash;providing the underlying components and infrastructure for agents to employ.
            Higher level <i><u>capabilities</u></i> of <em>Perception</em>, <em>Reasoning</em>, <em>Action</em>, and <em>Collaboration</em> emerge from the platform's components.
            Finally, <i><u>agents</u></i> leverage these capabilities along with user-defined metadata in task-specific <code>run()</code> routines to solve tasks.
        </p>
        <p class="click-hint"><strong><img src="static/img/icons/click.gif" style="width: 35px"> Click on any individual module to view its detailed description</strong></p>
        <div class="clickable-image-container">
            <img src="static/img/architecture.jpg" alt="Architecture figure" style="width: 100%" draggable="false">

            <!-- clickable regions Agent -->
            <div class="clickable-region" style="top: 5.5%; left: 1.5%; width: 4.5%; height: 12.8%;" onclick="showInfo('agent-info');"></div>
            <div class="clickable-region" style="top: 2.8%; left: 9.5%; width: 28.5%; height: 6.8%;" onclick="showInfo('backgroundInfo');"></div>
            <div class="clickable-region" style="top: 2.8%; left: 39%; width: 28%; height: 6.8%;" onclick="showInfo('intentionInfo');"></div>
            <div class="clickable-region" style="top: 2.8%; left: 67.5%; width: 28%; height: 6.8%;" onclick="showInfo('interoceptiveInfo');"></div>
            <div class="clickable-region" style="top: 11%; left: 12%; width: 10%; height: 11%;" onclick="showInfo('agent-location');"></div>
            <div class="clickable-region" style="top: 11%; left: 27%; width: 10%; height: 11%;" onclick="showInfo('agent-biography');"></div>
            <div class="clickable-region" style="top: 11%; left: 40%; width: 10%; height: 11%;" onclick="showInfo('agent-goal');"></div>
            <div class="clickable-region" style="top: 11%; left: 55%; width: 10%; height: 11%;" onclick="showInfo('agent-task');"></div>
            <div class="clickable-region" style="top: 11%; left: 69.5%; width: 10%; height: 11%;" onclick="showInfo('agent-mental');"></div>
            <div class="clickable-region" style="top: 11%; left: 84%; width: 10%; height: 11%;" onclick="showInfo('agent-physical');"></div>
            <!-- clickable regions Capability -->
            <div class="clickable-region" style="top: 22.5%; left: 1.5%; width: 4.5%; height: 22.5%;" onclick="showInfo('capability-info');"></div>
            <div class="clickable-region" style="top: 30%; left: 10%; width: 21%; height: 7%;" onclick="showInfo('capability-perception');"></div>
            <div class="clickable-region" style="top: 30%; left: 31.5%; width: 20.5%; height: 7%;" onclick="showInfo('capability-reasoning');"></div>
            <div class="clickable-region" style="top: 30%; left: 53%; width: 20.5%; height: 7%;" onclick="showInfo('capability-action');"></div>
            <div class="clickable-region" style="top: 30%; left: 74.5%; width: 20.5%; height: 7%;" onclick="showInfo('capability-collaboration');"></div>
            <!-- clickable regions Platform -->
            <div class="clickable-region" style="top: 62.5%; left: 1.5%; width: 4.5%; height: 16.5%;" onclick="showInfo('platform-info');"></div>
            <div class="clickable-region" style="top: 46%; left: 9%; width: 47%; height: 8%;" onclick="showInfo('platform-cv');"></div>
            <div class="clickable-region" style="top: 46%; left: 57%; width: 40%; height: 8%;" onclick="showInfo('platform-lm');"></div>
            <div class="clickable-region" style="top: 73%; left: 14%; width: 77%; height: 7%;" onclick="showInfo('platform-env');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 9%; width: 13.5%; height: 15%;" onclick="showInfo('platform-cv-owr');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 23%; width: 13.5%; height: 15%;" onclick="showInfo('platform-cv-loc');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 37%; width: 13.5%; height: 15%;" onclick="showInfo('platform-cv-fm');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 51%; width: 13%; height: 15%;" onclick="showInfo('platform-cv-vqa');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 64.5%; width: 15.5%; height: 15%;" onclick="showInfo('platform-lm-tool');"></div>
            <div class="clickable-region" style="top: 55.5%; left: 80.5%; width: 15.5%; height: 15%;" onclick="showInfo('platform-lm-interact');"></div>
            <div class="clickable-region" style="top: 81%; left: 15%; width: 13.5%; height: 16%;" onclick="showInfo('platform-env-street');"></div>
            <div class="clickable-region" style="top: 81%; left: 30.5%; width: 13.5%; height: 16%;" onclick="showInfo('platform-env-geoloc');"></div>
            <div class="clickable-region" style="top: 81%; left: 45.5%; width: 13.5%; height: 16%;" onclick="showInfo('platform-env-move');"></div>
            <div class="clickable-region" style="top: 81%; left: 61%; width: 13.5%; height: 16%;" onclick="showInfo('platform-env-map');"></div>
            <div class="clickable-region" style="top: 81%; left: 76.5%; width: 13.5%; height: 16%;" onclick="showInfo('platform-env-place');"></div>
        </div>
        
        <!-- aside information -->
        <!-- aside information Agent -->
        <aside class="system-fundamental-aside" id="agent-info">
            <p>In our system, <strong>agent behavior</strong> is shaped by user-defined metadata, including a background, an intended goal, and an interoceptive state.</p>
            <p>Concretely, agents are developed by writing task-specific <code>run()</code> routines that leverage the various components of our platform and the agent's metadata to solve tasks.</p>
        </aside>
        <aside class="system-fundamental-aside" id="backgroundInfo">
            The <strong>background</strong> provides the context necessary to instantiate the agent in the real world (location), and to guide its reasoning and decision making (biography).
        </aside>
        <aside class="system-fundamental-aside" id="intentionInfo"> <strong>Intentions</strong> outline an agent's purpose within the environment.</aside>
        <aside class="system-fundamental-aside" id="interoceptiveInfo">An agent's <strong>interoceptive state</strong> reflects its internal mental and physical status&mdash;varying over time and influencing its behavior.</aside>
        <aside class="system-fundamental-aside" id="agent-location">
            <p>Users define an agent's initial <strong>location</strong>, which can be geocoordinates or an address.</p> 
            <p>All exemplar V-<i>IRL</i> agents leverage this information.</p></aside>
        <aside class="system-fundamental-aside" id="agent-biography">
            <p>Users define an agent's <strong>biography</strong>, which can include any information (e.g. age, job, hometown, personality, relationship, etc.) about the agent.</p>
            <p>This information is typically provided as context to inform the decisions the agent makes using LLMs, such as with the Place Recommender agents <a href="#aria">Aria</a> and <a href="#vivek">Vivek</a>, Intentional Explorer agent <a href="#hiro">Hiro</a>, and the Interactive Concierge agent <a href="#diego">Diego</a>.</p>
        </aside>
        <aside class="system-fundamental-aside" id="agent-goal">
            <p>Users define an agent's <strong>goal</strong> (intention) in plain English to help guide the agent's behavior. Goals are specified in the "intention" term in each <i>character card</i>.</p>
            <p>This information is typically provided as context to inform the decisions the agent makes using LLMs, such as with the Place Recommender agents <a href="#aria">Aria</a> and <a href="#vivek">Vivek</a>, Intentional Explorer agent <a href="#hiro">Hiro</a>, and the Interactive Concierge agent <a href="#diego">Diego</a>.</p>
        </aside>
        <aside class="system-fundamental-aside" id="agent-task">
            <p>An agent's <strong>task</strong> describes the high-level behavior expected of it. It is specified as the "Task" term in each <i>character card</i>.</p>
            <p>This information is for expositional purposes only; it is not used in the agent's decision-making context and does not directly influence agent behavior.</p>
        </aside>
        <aside class="system-fundamental-aside" id="agent-mental">
            <p>Users define an agent's <strong>mental state</strong>, which can be custom defined to include any aspects such as joy, stress, sadness, etc.</p>
            <p>In our examples, this information is mainly used in the "Interoceptive Estimator" and "Supervisor" of Interactive Concierge agent <a href="#diego">Diego</a>.</p>
        </aside>
        <aside class="system-fundamental-aside" id="agent-physical">
            <p>Users define an agent's <strong>physical state</strong>, which can be custom defined to include any aspects such as hunger, energy, pain, etc.</p>
            <p>In our examples, this information is mainly used in the "Interoceptive Estimator" and "Supervisor" of Interactive Concierge agent <a href="#diego">Diego</a>.</p>
        </aside>
        <!-- aside information Capability -->
        <aside class="system-fundamental-aside" id="capability-info">
            <p><strong>Capabilities</strong> bridge the agent with the platform. Agents are able to leverage different capabilities along with the user-defined metadata to accomplish a multitude of tasks.</p>
            <p>In the <a href="#agent-exemplars">&sect;V-<i>IRL</i> agents</a> section, we present agents that exhibit increasingly complex behaviors, each requiring additional components from the platform.
            From simple combinations, like the Route Optimizer agent 
            <a href="#peng">Peng</a>, to more complex arrangements, 
            like the Tourist agent <a href="#ling">Ling</a>,
            we showcase the versatility of the V-<i>IRL</i> platform and its applicability to a wide range of real-world scenarios.</p>
        </aside>
        <aside class="system-fundamental-aside" id="capability-perception"> 
            <p><strong>Perception</strong> capabilities enable agents to process the sensory-rich data provided by the <i>environment</i>, especially street view imagery.</p>
            <p>This capability is used in agents with <img src="static/img/tags/cv.png" class="inline-tag" draggable="false">: <a href="#rx-399">RX-399</a>, Urban Planner, Intentional Explorer, Tourist, and Interactive Concierge.</p>
        </aside>
        <aside class="system-fundamental-aside" id="capability-reasoning">
            <p><strong>Reasoning</strong> capabilities allow decision making based on information from perception and the environment.</p>
            <p>This capability is used in agents with <img src="static/img/tags/lm.png" class="inline-tag" draggable="false">: Place Recommender, Intentional Explorer, Tourist, Local, and Interactive Concierge.</p>
        </aside>
        <aside class="system-fundamental-aside" id="capability-action">
            <p><strong>Action</strong> capabilities are responsible for grounding agents in the world around them; providing a navigable representation and geospatial information of real cities.</p>
            <p>This capability is used in agents with <img src="static/img/tags/geo.png" class="inline-tag" draggable="false"> (all agents).</p>
        </aside>
        <aside class="system-fundamental-aside" id="capability-collaboration">
            <p><strong>Collaboration</strong> capabilities enable the interaction between agents or with humans.</p>
            <p>This capability is used in agents with <img src="static/img/tags/col.png" class="inline-tag" draggable="false">, such as Tourist-Local and Interactive Concierge.</p>
        </aside>
        <!-- aside information Platform -->
        <aside class="system-fundamental-aside" id="platform-info">
            <p><strong>Platform</strong> components provide the infrastructure to instantiate capabilities, execute agent actions, and ground agents in the real world.</p>
        </aside>
        <aside class="system-fundamental-aside" id="platform-cv">
            <p><strong>Computer Vision</strong> modules instantiate vision models that are used to enable <i>perception</i> capabilities.</p>
        </aside>
        <aside class="system-fundamental-aside" id="platform-lm">
            <strong>Language Model</strong> modules instantiate LLMs which enable <i>reasoning</i> and <i>collaboration</i> capabilities via question answering, interaction, and tool & API use.</aside>
        <aside class="system-fundamental-aside" id="platform-env">
            <strong>Environment</strong> modules are responsible for grounding agents in the world around them: providing a navigable representation of real cities.
        </aside>
        <aside class="system-fundamental-aside" id="platform-cv-owr">
            <strong>Open-world recognition</strong> models <d-cite key="radford2021learning"></d-cite> are more general than localization models, and allow agents to detect the presence of a wide range of objects in their field of view (e.g., the Tourist searches for the Apple Store).
        </aside>
        <aside class="system-fundamental-aside" id="platform-cv-loc">
            <strong>Localization</strong> models <d-cite key="li2022grounded"></d-cite> give agents a precise spatial understanding of their environment.
            This allows <a href="#rx-399">RX-399</a> to identify and count instances of objects, and <a href="#hiro">Hiro</a> to pick out specific businesses to look up using the environment APIs.
        </aside>
        <aside class="system-fundamental-aside" id="platform-cv-fm">
            <strong>Feature matching</strong> models <d-cite key="lindenberger2023lightglue"></d-cite> provide an understanding of continuity across views of the same location, and enable agents to identify & deduplicate instances of the same object from different viewpoints (e.g., <a href="#rx-399">RX-399</a>, Urban Planner, and Intentional Explorer <a href="#hiro">Hiro</a>).
        </aside>
        <aside class="system-fundamental-aside" id="platform-cv-vqa">
            Multimodal models with <strong>VQA</strong> capabilities <d-cite key="li2023blip"></d-cite> bridge the perceptual world with natural language, and
            are essential for integration with <i>reasoning</i>.
            They allow agents to verbalize visual information, transforming it into text that can be used for decision-making (e.g., the Intentional Explorer <a href="#hiro">Hiro</a> asks "Which road is better?" at intersections).
        </aside>
        <aside class="system-fundamental-aside" id="platform-lm-tool">
            LLMs such as GPT-4 <d-cite key="openai2023gpt"></d-cite> and Llama&nbsp;2 <d-cite key="touvron2023llama"></d-cite>
            enable V-<i>IRL</i> agents to <strong>use tools</strong> and
            interface across various <strong>APIs</strong>, transforming environmental data and perceptual outputs into actionable insights.
            For example, the Real Estate agent <a href="#vivek">Vivek</a> uses real estate APIs to find properties that match a user's preferences and the Place Recommender agent <a href="#aria">Aria</a> uses Google Maps APIs to find places that match a user's preferences.
        </aside>
        <aside class="system-fundamental-aside" id="platform-lm-interact">
            LLMs enable <strong>collaboration</strong> between agents or with humans through natural language (see 
            Tourist-Local agent <a href="#ling">Ling</a> and Interactive Concierge agent <a href="#diego">Diego</a>). Custom prompts facilitate these interactions. 
        </aside>
        <aside class="system-fundamental-aside" id="platform-env-street">
            Agents can access first-person <strong>Street View Imagery</strong> corresponding to their geolocation, field-of-view, and ego-pose. This module provides a one-to-one correspondence between street view panorama and geolocation.
        </aside>
        <aside class="system-fundamental-aside" id="platform-env-geoloc">
            The <strong>Geolocation</strong> (a set of lat-long geocoordinates) is fundamental to any embodied agent, representing its position in the world and allowing it to interoperate with other location-based APIs and modules (e.g., Movement).
            Agents can also translate natural language addresses into a geolocation using the Geolocation module.
        </aside>
        <aside class="system-fundamental-aside" id="platform-env-move">
            Through the <strong>Movement</strong> module, an agent can obtain all navigable directions and positions that have street view imagery around its current location.
            Agents can then use the Movement module to move to a new location, updating their geolocation.
        </aside>
        <aside class="system-fundamental-aside" id="platform-env-map">
            The <strong>Mapping</strong> module allows agents to get available routing, time, and distance from one location to another for use in decision-making.
        </aside>
        <aside class="system-fundamental-aside" id="platform-env-place">
            The <strong>Place Info & Search</strong> module enables agents to lookup information about nearby destinations using search-parameters such as name, place type, radius, etc.
            It provides place information such as name, address, reviews, photos, and so on.
        </aside>

        <figcaption style="width: 100%; text-align: center">Hierarchical V-<i>IRL</i> architecture.</figcaption>
        </div>


        <!-- Part 3: Benchmark -->
        <div id="benchmark" style="position: relative">
        <h2 class="text"><img src="static/img/icons/benchmark.png" width="50px"> V-<i>IRL</i> Benchmark</h2>
        <p class="text">
            The essential attributes of V-<i>IRL</i> include its ability to access geographically diverse data derived from real-world sensory input, and its API that facilitates interaction with Google Maps Platform (GMP) <d-cite key="google_map_platform"></d-cite>.
            This enables us to develop three V-<i>IRL</i> benchmarks to assess the capabilities of existing vision models in such open-world data distribution.
        </p>

        <h3 class="text">V-<em>IRL Place:</em> Localization</h3>
        <p class="text">
            <strong>Motivation:</strong> Every day, humans traverse through cities, moving between diverse places to fulfil a range of goals, like the Intentional Explorer agent.
            We assess the performance of vision models on the everyday human activity <i>place localization</i> using street view imagery and associated place data.
        </p>
        <p class="text">
            <strong>Setups:</strong> We modify the RX-399 agent to traverse polygonal areas while localizing & identifying 20 types of places.
            We evaluate five prominent open-world detection models:  GroundingDINO <d-cite key="liu2023grounding"></d-cite>, GLIP <d-cite key="li2022grounded"></d-cite>, Owl-ViT <d-cite key="minderer2022simple"></d-cite>, OpenSeeD <d-cite key="zhang2023openseed"></d-cite> and Owl-ViT v2 <d-cite key="minderer2024owlvitv2"></d-cite>. We also implement a straightforward baseline, CLIP (w/ GLIP proposal), which involves reclassifying the categories
            of GLIP proposals with CLIP <d-cite key="radford2021learning"></d-cite>.
            Models are evaluated on localization recall, which is quantified as 
            <math xmlns="http://www.w3.org/1998/Math/MathML">
                <mfrac>
                    <msub>
                        <mi>N</mi>
                        <mi>tp</mi>
                    </msub>
                    <mrow>
                        <msub>
                            <mi>N</mi>
                            <mi>tp</mi>
                        </msub>
                        <mo>+</mo>
                        <msub>
                            <mi>N</mi>
                            <mi>fn</mi>
                        </msub>
                    </mrow>
                </mfrac>
            </math>, where
            <math xmlns="http://www.w3.org/1998/Math/MathML">
                <msub>
                    <mi>N</mi>
                    <mi>tp</mi>
                </msub>
            </math>
            and
            <math xmlns="http://www.w3.org/1998/Math/MathML">
                <msub>
                    <mi>N</mi>
                    <mi>fn</mi>
                </msub>
            </math>
            represent the number of correctly localized places and missed places, respectively.
        </p>

        <d-figure>
            <figure style="margin-top: 5px; margin-bottom: 15px">
                <img data-zoomable src="static/img/bm_local_matching.jpg" alt="V-IRL Place Localization" style="width: 80%;display: block; margin: auto;">
                <figcaption style="width: 100%;">Matching between 2D object proposal and street place. We first <i>project</i> the bounding box of each object proposal onto a frustum in the 3D space, subject to a radius.
                We then determine if any <i>nearby places</i> fall within this frustum and radius. 
                If any nearby place is found, the closest one is assigned as the <i>ground truth</i> for the object proposal. Otherwise, the object proposal is regarded as a <i>false positive</i>. 
                When multiple places are inside the frustum, we consider the nearest one as the ground truth since it would likely block the others in the image.</figcaption>
            </figure>
        </d-figure>

        <p class="text">
            <strong>Results:</strong> The following table shows that open-world detectors like GroundingDINO <d-cite key="liu2023grounding"></d-cite>, Owl-ViT <d-cite key="minderer2022simple"></d-cite> and GLIP <d-cite key="li2022grounded"></d-cite> are biased towards certain place types such as <code>school</code>, <code>cafe</code>, and <code>convenience store</code>, respectively.
            In contrast, CLIP (w/ GLIP proposal) can identify a broader spectrum of place types.
            This is mainly caused by the category bias in object detection datasets with a limited vocabulary. 
            Hence, even if detectors like Owl-ViT are initialized with CLIP, their vocabulary space narrows down due to fine-tuning. 
            These results suggest that cascading category-agnostic object proposals to zero-shot recognizers appears promising for "real" open-world localization, especially for less common categories in object detection datasets.
        </p>

        <d-figure>
            <figure style="margin-bottom: 10px; margin-top: 0px ">
                <img src="static/img/bm_table_localization.png" alt="V-IRL Place Localization Results" style="width: 100%">
                <figcaption style="width: 100%;">Benchmark results on V-<i>IRL</i> Place Localization. <math xmlns="http://www.w3.org/1998/Math/MathML">
                    <mi>AR</mi>
                    <msup>
                        <mrow></mrow>
                        <mn>10</mn>
                    </msup>
                </math> and <math xmlns="http://www.w3.org/1998/Math/MathML">
                    <mi>AR</mi>
                    <msup>
                        <mrow></mrow>
                        <mn>20</mn>
                    </msup>
                </math> denote average recall on subsampled 10 and all 20 place categories, respectively. More results in paper.</figcaption>
            </figure>
        </d-figure>

        <!-- Grid of example place-localization results (six columns of four images).
             NOTE(review): alt text is derived from the image file names; adjust if a picture shows a different place type. -->
        <div class="img-grid-row">
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/bookstore.jpg" alt="Place localization result: bookstore" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/bookstore2.jpg" alt="Place localization result: bookstore (example 2)" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/clothing2.jpg" alt="Place localization result: clothing store (example 2)" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/jewelry.jpg" alt="Place localization result: jewelry store" style="width:100%">
            </div>
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/movie.jpg" alt="Place localization result: movie theater" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/clothing.jpg" alt="Place localization result: clothing store" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/spa.jpg" alt="Place localization result: spa" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/spa2.jpg" alt="Place localization result: spa (example 2)" style="width:100%">
            </div>
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/convenience.jpg" alt="Place localization result: convenience store" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/convenience2.jpg" alt="Place localization result: convenience store (example 2)" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/laundry.jpg" alt="Place localization result: laundry" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/laundry2.jpg" alt="Place localization result: laundry (example 2)" style="width:100%">
            </div>
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/bank.jpg" alt="Place localization result: bank" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/bank2.jpg" alt="Place localization result: bank (example 2)" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/park.jpg" alt="Place localization result: park" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/park2.jpg" alt="Place localization result: park (example 2)" style="width:100%">
            </div>
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/cafe.jpg" alt="Place localization result: cafe" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/bar.jpg" alt="Place localization result: bar" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/restaurant.jpg" alt="Place localization result: restaurant" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/bakery.jpg" alt="Place localization result: bakery" style="width:100%">
            </div>
            <div class="img-grid-column">
                <img data-zoomable src="static/img/place_loc_imgs/pharmacy.jpg" alt="Place localization result: pharmacy" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/lodging.jpg" alt="Place localization result: lodging" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/hospital.jpg" alt="Place localization result: hospital" style="width:100%">
                <img data-zoomable src="static/img/place_loc_imgs/hospital2.jpg" alt="Place localization result: hospital (example 2)" style="width:100%">
            </div>
        </div>
        <figcaption style="text-align: center; margin-bottom: 10px;">Part of V-<i>IRL</i> Place localization benchmark results via CLIP (w/ GLIP proposal).</figcaption>


        <h3 class="text">V-<i>IRL</i> Place: Recognition and VQA</h3>
        <p class="text">
            <strong>Motivation:</strong> In contrast to the challenging V-<i>IRL</i> place localization task on street view imagery, in real life, humans can recognize businesses by taking a closer, <strong>place-centric</strong> look.
            In this regard, we assess existing vision models on two perception tasks based on place-centric images: <i>i)</i> recognizing specific place types; <i>ii)</i> identifying human intentions by Vision Question Answering (VQA), named intention VQA.
        </p>

        <p class="text" style="margin-bottom: 5px;">
            <strong>Setups:</strong> For recognition, we assess 10 open-world recognition models, for place type recognition from 96 options, using <i>place-centric images</i> (see below imagery illustration).
        </p>
        <!-- Slideshow comparing street view imagery with place-centric imagery.
             The large slides carry the same alt text as their matching thumbnails below. -->
        <div id="slider-img-street-place" class="slider-img-container">
            <div class="my-slides">
                <img data-zoomable src="static/img/streetview_vs_place/streetview_vs_placeimage_1.jpg" alt="Street view imagery (left) vs place-centric imagery (right)." style="width:100%">
            </div>

            <div class="my-slides">
                <img data-zoomable src="static/img/streetview_vs_place/streetview_vs_placecentric_2.jpg" alt="Street view imagery (left) vs place-centric imagery (right)." style="width:100%">
            </div>

            <div class="my-slides">
                <img data-zoomable src="static/img/streetview_vs_place/streetview_vs_placecentric_3.jpg" alt="Street view imagery (left) vs place-centric imagery (right)." style="width:100%">
            </div>

            <a class="prev" onclick="plusSlides('slider-img-street-place', -1)">❮</a>
            <a class="next" onclick="plusSlides('slider-img-street-place', 1)">❯</a>

            <figcaption style="margin-bottom: 10px;">Street view imagery (left), sourced from the Google Street View database, is taken from a street-level perspective, encompassing a broad view of the surroundings, including multiple buildings.
            Place-centric imagery (right), drawn from the Google Place database, focuses predominantly on the specific place, providing a more concentrated view.</figcaption>
            <figcaption id="caption" style="margin-bottom: 10px; display: none"></figcaption>

            <div class="row">
                <div class="column">
                    <img class="demo cursor" src="static/img/streetview_vs_place/streetview_vs_placeimage_1.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-street-place', 1)" alt="Street view imagery (left) vs place-centric imagery (right).">
                </div>
                <div class="column">
                    <img class="demo cursor" src="static/img/streetview_vs_place/streetview_vs_placecentric_2.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-street-place', 2)" alt="Street view imagery (left) vs place-centric imagery (right).">
                </div>
                <div class="column">
                    <img class="demo cursor" src="static/img/streetview_vs_place/streetview_vs_placecentric_3.jpg" style="height: 60px; width: auto" onclick="currentSlide('slider-img-street-place', 3)" alt="Street view imagery (left) vs place-centric imagery (right).">
                </div>
            </div>
        </div>
        <p class="text">
        For intention VQA, we also evaluate 13 multi-modal large language models (MM-LLM) to determine viable human intentions from a four-option multiple-choice VQA. The V-<i>IRL</i> Place VQA process is illustrated in the following image, where the candidate and true choices are generated by GPT-4 <d-cite key="openai2023gpt"></d-cite> given the place types and place names corresponding to the image.
        </p>
        <figure style="margin-top: 0px; margin-bottom: 0px">
            <img src="static/img/bm_vqa_example.jpg" alt="V-IRL Place VQA" style="width: 80%; display: block; margin: auto;">
            <figcaption style="width: 100%; text-align: center">Example of the V-<i>IRL</i> Place VQA process.</figcaption>
        </figure>

        <p class="text">
            <strong>Results:</strong> The following table shows that CLIP (L/14@336px) outperforms even the biggest version of EVA-02-CLIP and SigLIP in the V-<i>IRL</i> Place recognition task, emphasizing the high-quality data of CLIP.
            The bottom of the table shows that LLaVA-NeXT (7B) outperforms its predecessors LLaVA-1.5 and 1.0, but still has over 8% gap to InternVL-1.5 with 26B parameters. Closed-source MLLMs GPT-4V and Qwen-VL-Max yield outstanding performance compared to most open-sourced models.
            We note that even these top-performing MLLMs (e.g. GPT-4V and Qwen-VL-Max) still suffer from inconsistency issues during the circular evaluation.
            Moreover, vision models perform better on place VQA over place-type recognition, suggesting direct prompts about human intention could be more effective for intention-driven tasks.
        </p>
        <figure style="margin-top: 0px; margin-bottom: 0px">
            <img src="static/img/bm_table_rec_vqa.png" alt="Benchmark results table for V-IRL Place recognition and V-IRL Place VQA" style="width: 80%; display: block; margin: auto;">
            <figcaption style="width: 100%; text-align: left">Benchmark results on V-<i>IRL</i> Place recognition and V-<i>IRL</i> Place VQA. <span class="resscolor">Green</span> indicates increased resolution models, while <span class="sizescolor">Blue</span> denotes model parameter scaling.</figcaption>
        </figure>

        <h3 class="text">V-<i>IRL</i> Vision Language Navigation</h3>
        <p class="text">
            <strong>Motivation:</strong> As discussed in the V-<i>IRL</i> agents section, Intentional Explorer and Tourist agents require collaboration between vision models and language models to accomplish complex tasks. Therefore, this motivates us to investigate the performance of vision-language collaboration, with environmental information acquired through visual perception models from real-world images. 
            This prompts us to build an embodied task for jointly leveraging vision and language models along with the realistic street views in V-<i>IRL</i>. 
            In this regard, we build this V-<i>IRL</i> Vision Language Navigation (VLN) benchmark.
        </p>

        <p class="text">
            <strong>Setups:</strong> We adapt the Tourist agent implementation and replace its recognition component with the various benchmarked models. These methods are tasked to identify visual landmarks during navigation. Subsequently, GPT-4 <d-cite key="openai2023gpt"></d-cite> predicts the next action according to the recognition results. Navigation instructions are generated using the Local agent.<br> 
            Four approaches are evaluated to recognize landmarks during navigation: <i>(i)</i> Approximate oracle by searching nearby landmarks; <i>(ii)</i> Zero-shot recognizers CLIP <d-cite key="radford2021learning"></d-cite> and EVA-02-CLIP <d-cite key="EVA-CLIP"></d-cite>; <i>(iii)</i> Multi-modal LLM LLaVA-1.5 <d-cite key="liu2023improvedllava"></d-cite>; and
            <i>(iv)</i> OCR model <d-cite key="du2009pp"></d-cite> to recognize potential text in street views followed by GPT answer parsing.
        </p>

        <p class="text">
            <strong>Results:</strong> The following table shows that, with <em>oracle landmark information</em>, powerful LLMs can impressively comprehend navigation instructions and thus make accurate decisions. However, when using vision models to fetch landmark information from street views, the success rate drops dramatically, suggesting that the perception of vision models is noisy and misguides LLMs' decision making. Among these recognizers, larger variants of CLIP <d-cite key="radford2021learning"></d-cite> and EVA-02-CLIP <d-cite key="EVA-CLIP"></d-cite> perform better, highlighting the benefits of model scaling. LLaVA-1.5 <d-cite key="liu2023improvedllava"></d-cite> shows inferior performance with CLIP (L/14@336px) as its vision encoder, possibly due to the alignment tax <d-cite key="openai2023gpt"></d-cite> during instruction tuning.
            Further, PP-OCR <d-cite key="du2009pp"></d-cite> (+ GPT-3.5) achieves a 28% success rate, signifying that OCR is crucial for visual landmark recognition. 
        </p>

        <d-figure>
            <figure style="margin-top: 0px; margin-bottom: 0px">
                <img src="static/img/bm_table_vln.png" alt="V-IRL VLN" style="width: 80%; display: block; margin: auto;" data-zoomable>
                <figcaption style="width: 100%; text-align: left">Results on V-<i>IRL</i> VLN-mini. We test various CLIP-based models, <span class="resscolor">MM-LLM</span>, and <span class="sizescolor">OCR</span> model with GPT postprocessing.  We primarily measure navigation success rate (<i>Success</i>). In addition, as navigation success is mainly influenced by the agent's actions at key positions (<i>i.e.</i>, start positions, intersections and stop positions), we also evaluate the arrival ratio (<i>Arr</i>) and reaction accuracy (<i>Reac</i>) for each route. Arr denotes the percentage of key positions reached, while Reac measures the accuracy of the agent's action predictions at these key positions. Full-set results on CLIP and Oracle are available in paper appendix.</figcaption>
            </figure>
        </d-figure>

        <h3 id="geo-diversity" class="text">Geographic Diversity</h3>
        <p class="text">
            Spanning 12 cities across the globe, our V-<i>IRL</i> benchmarks provide an opportunity to analyze the inherent model biases in different regions. As depicted in the following figure, vision models demonstrate subpar performance on all three benchmark tasks in Lagos, Tokyo, Hong Kong, and Buenos Aires.
            In Lagos, vision models might struggle due to its non-traditional street views relative to more developed cities (see street views in aside figures). For cities like Tokyo, Hong Kong and Buenos Aires, an intriguing observation is their primary use of non-English languages in street views.
            This suggests that existing vision models face challenges with multilingual image data.
        </p>

        <!-- City-level benchmark figure with example street views in the two side bars.
             Every id here is unique within the document; each side-bar image has its own id and alt text. -->
        <div class="bm_city_fig">
            <img id="bm_city_img" data-zoomable src="static/img/bm_city_analysis.jpg" alt="City level analysis" style="width: 100%;">
            <figcaption style="width: 100%; text-align: center">City-level visualization of V-<i>IRL</i> benchmark results.</figcaption>
            <aside class="left-side-bar">
                <img id="street_lagos" data-zoomable src="static/img/bm_city/street_lagos.jpg" alt="Street view of Lagos, Nigeria" style="width: 240px; height: auto; ">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">Lagos, Nigeria.</figcaption>
                <img data-zoomable id="street_tokyo" src="static/img/bm_city/street_tokyo.jpg" alt="Street view of Tokyo, Japan" style="width: 240px; height: auto">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">Tokyo, Japan.</figcaption>
                <img data-zoomable id="street_ny" src="static/img/bm_city/street_ny.jpg" alt="Street view of New York City, USA" style="width: 240px; height: auto">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">New York City, USA.</figcaption>
            </aside>
            <aside class="right-side-bar">
                <img data-zoomable id="street_hk" src="static/img/bm_city/street_hk.jpg" alt="Street view of Hong Kong, China" style="width: 240px; height: auto">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">Hong Kong, China.</figcaption>
                <img data-zoomable id="street_ba" src="static/img/bm_city/street_Buenos_Aires.jpg" alt="Street view of Buenos Aires, Argentina" style="width: 240px; height: auto">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">Buenos Aires, Argentina.</figcaption>
                <img data-zoomable id="street_mumbai" src="static/img/bm_city/street_mumbai.jpg" alt="Street view of Mumbai, India" style="width: 240px; height: auto">
                <figcaption style="width: 240px; text-align: center; margin-bottom:5px">Mumbai, India.</figcaption>
            </aside>
        </div>
        </div>

        <div id="discussion" style="position: relative; margin-top: 40px; margin-bottom: 0px;">
            <h2 class="text" style="margin-top:0px; margin-bottom:10px"><img src="static/img/icons/discussion.png" alt="" width="50"> Discussion: Ethics &amp; Privacy</h2>
        
            <p class="text">Our platform serves as a tool for AI development and as a crucible for ethical discourse and preparation. As AI is inevitably being integrated into society&mdash;<em>e.g.</em>, via augmented reality wearables or robots navigating city streets&mdash;it is imperative to confront and discuss ethical and privacy concerns now. Unlike these impending <em>real-time</em> systems, the data accessed by <em>V-IRL</em> is "stale" and often preprocessed&mdash;providing a controlled environment to study these concerns.</p>
            
            <p class="text">Notably, <em>V-IRL</em> exclusively utilizes preexisting, readily available APIs; it does not capture or make available any previously inaccessible data. Our primary source of street-view imagery, Google Maps<d-cite key="google_map_platform"></d-cite>, is subject to major privacy-protection measures, including blurring faces and license plates <d-cite key="frome2009large"></d-cite>. 
            Moreover, <em>V-IRL</em> complies with the <a href="https://cloud.google.com/maps-platform/terms">Google Maps Platform license</a>, similarly to notable existing works that also leverage Google's street views <d-cite key="zamir2014image"></d-cite> <d-cite key="chen2019touchdown"></d-cite>.</p>
            
            <p class="text">
                We believe <em>V-IRL</em> is an invaluable tool for researching bias. As discussed in <a href="#geo-diversity">geographic diversity</a>, <em>V-IRL</em>'s <em>global scale</em> provides a lens to study linguistic, cultural, and other geographic biases inherent in models. By using <em>V-IRL</em> to study such questions, we aim to preemptively tackle the ethical dilemmas that will arise with deploying real-time systems rather than being blindsided by them. We hope our work helps spur proactive discussion of future challenges throughout the community.
            </p>
        </div>

        <div id="conclusion" style="position: relative; margin-top: 20px; margin-bottom: 40px;">
            <h2 class="text" style="margin-top:0px; margin-bottom:10px"><img src="static/img/icons/conclusion.jpg" alt="" width="50"> Conclusion</h2>
            <p class="text">
                In this work, we introduce <em>V-IRL</em>, an open-source platform designed to bridge the sensory gap between the digital and physical worlds, enabling AI agents to interact with the real world in a virtual yet realistic environment.
                Through <em>V-IRL</em>, agents can develop rich sensory grounding and perception, utilizing real geospatial data and street-view imagery.
                We demonstrate the platform's versatility by creating diverse exemplar agents and developing benchmarks measuring the performance of foundational language and vision models on open-world visual data from across the globe.
            </p>
            <p class="text">
                This platform opens new avenues for advancing AI capabilities in perception, decision-making, and real-world data interaction.
                As spatial computing and robotic systems become increasingly prevalent, the demand for and possibilities of AI agents will only grow.
                From personal assistants to practical applications like urban planning to life-changing tools for the visually impaired, we hope <em>V-IRL</em> helps usher in a new era of perceptually grounded agents.
            </p>
        </div>

        </d-article>
        <d-appendix>
            <h3>BibTeX</h3>
            <p class="bibtex">
                @inproceedings{yang2024virl,<br>
                &nbsp;&nbsp;title={{V-IRL: Grounding Virtual Intelligence in Real Life}},<br>
                &nbsp;&nbsp;author={Yang, Jihan and Ding, Runyu and Brown, Ellis and Qi, Xiaojuan and Xie, Saining},<br>
                &nbsp;&nbsp;year={2024},<br>
                &nbsp;&nbsp;booktitle={European Conference on Computer Vision},<br>
                }
            </p>

            <d-footnote-list></d-footnote-list>
            <d-citation-list></d-citation-list>
        </d-appendix>
        
        <!-- bibliography will be inlined during Distill pipeline's pre-rendering -->
        <d-bibliography src="bibliography.bib"></d-bibliography>
        <script src="./static/js/nav-bar.js"></script>
        
        <script type="text/javascript">
            // Page start-up calls. changeContent, switchVideo and showSlides are not
            // defined in this block; presumably they come from the scripts under
            // static/js that the page loads earlier — TODO confirm.
            changeContent('Place1');
            // NOTE(review): all three calls pass the same 'video1Container' and
            // 'video1Preview' ids — confirm this is intended rather than a copy-paste slip.
            switchVideo('RX399', 'video1Container', 'video1Preview');
            switchVideo('UrbanPlanner', 'video1Container', 'video1Preview');
            switchVideo('Tourist', 'video1Container', 'video1Preview');

            // Every slider below is started with the same index value, 1.
            // NOTE(review): slideIndex is a top-level binding, so other scripts may read
            // it — verify before renaming or scoping it.
            let slideIndex = 1;
            // The first argument is the id of a slider container element; two calls pass
            // an extra third argument ('tourist-aside', 'explorer-aside') whose meaning is
            // defined by showSlides — presumably the id of a companion aside; verify.
            showSlides("slider-img-rx399", slideIndex);
            showSlides("slider-img-tourist", slideIndex, 'tourist-aside');
            showSlides("slider-img-photographer", slideIndex);
            showSlides("slider-img-street-place", slideIndex);
            showSlides("slider-img-explorer", slideIndex, 'explorer-aside');
            showSlides("slider-img-diego-revise-verbal", slideIndex);
            showSlides("slider-img-diego-revise-state", slideIndex);

        </script>
    </body>
</html>