index.html

<!DOCTYPE html>
<!-- The page content is written in English; declaring it lets screen readers,
     translation tools, and search engines pick the right language. -->
<html lang="en">

<head>
  <!-- Document metadata: encoding, description/keywords, responsive viewport, and title. -->
  <meta charset="utf-8">
  <meta name="description" content="The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation">
  <meta name="keywords" content="Text-to-image Generation, Realistic-Fantasy Benchmark, Diffusion Model, Large Language Models (LLMs)">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation</title>

  <!-- Google Analytics (gtag.js): the library is fetched asynchronously, and the
       inline snippet below records commands on the global dataLayer array. -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    // Reuse an existing dataLayer if one is already defined, otherwise start an empty one.
    window.dataLayer = window.dataLayer || [];

    // Pushes the raw `arguments` object (not a copied array) onto dataLayer.
    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());
    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Web fonts. -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

  <!-- Stylesheets: Bulma and its extensions, icon fonts, then the page's own rules. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">

  <!-- Scripts, kept in their original load order. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>

  <!-- Page-local rules for figure captions and the results table. -->
  <style>
    /* Small, centered, grey captions under figures. */
    figcaption {
      color: #555;
      font-size: 14px;
      margin-top: 5px;
      text-align: center;
    }

    /* Full-width table with single (collapsed) cell borders. */
    table {
      border-collapse: collapse;
      margin: 20px 0;
      width: 100%;
    }

    th,
    td {
      border: 1px solid #000;
      padding: 8px;
      text-align: center;
    }

    /* Header cells and the highlighted row share the same light-grey fill. */
    th,
    .highlight {
      background-color: #f2f2f2;
    }

    /* Relative-improvement annotations in the highlighted row. */
    .improvement {
      color: red;
    }
  </style>
</head>

<body>
  <!-- Hero header: paper title, venue, authors, affiliations, and resource links. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">The Fabrication of Reality and Fantasy: Scene Generation with
              LLM-Assisted Prompt Interpretation</h1>
            <h3 class="title is-3 publication-title">(ECCV 2024)</h3>

            <!-- Author list. Authors without a homepage use a placeholder <a> with no
                 href: an empty href="" resolves to the current page and would simply
                 reload it when clicked. Superscripts map to the affiliations below. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a>Yi Yao*</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a>Chan-Feng Hsu*</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a>Jhe-Hao Lin</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a>Hongxia Xie</a><sup>2</sup>,
              </span>
              <span class="author-block">
                <a>Terence Lin</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a>Yi-Ning Huang</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://basiclab.lab.nycu.edu.tw/">Hong-Han Shuai</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.csie.ntu.edu.tw/~wenhuang/">Wen-Huang Cheng</a><sup>3</sup>
              </span>
            </div>

            <div>
              <span class="author-block"><sup>*</sup>Equal contribution</span>
            </div>

            <!-- Affiliations, keyed by the superscript numbers used in the author list. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block"><sup>1</sup>National Yang Ming Chiao Tung University,</span>
              <span class="author-block"><sup>2</sup>Jilin University,</span>
              <span class="author-block"><sup>3</sup>National Taiwan University</span>
            </div>

            <!-- Resource buttons: paper PDF, arXiv abstract, code repository, dataset. -->
            <div class="column has-text-centered">
              <div class="publication-links">
                <!-- PDF Link. -->
                <span class="link-block">
                  <a href="https://arxiv.org/pdf/2407.12579" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper</span>
                  </a>
                </span>
                <!-- arXiv Link. -->
                <span class="link-block">
                  <a href="https://arxiv.org/abs/2407.12579" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                  </a>
                </span>
                <!-- Code Link. -->
                <span class="link-block">
                  <a href="https://github.com/leo81005/Reality-and-Fantasy" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
                <!-- Dataset Link. -->
                <span class="link-block">
                  <a href="https://docs.google.com/spreadsheets/d/1AmOInyxVAnDgAILoJPDO6cJdm6wVesdBSSbx7NgEp88/edit?usp=sharing"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="far fa-images"></i>
                    </span>
                    <span>Dataset</span>
                  </a>
                </span>
              </div>

            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


  <!-- Teaser image shown directly below the header. -->
  <section class="section">
    <div class="container is-max-desktop">
      <!-- The obsolete align="center" attribute has no effect on <figure>; the Bulma
           text-alignment helper centers the inline image instead. -->
      <figure class="has-text-centered">
        <img id="intro" src="./static/images/intro.png" alt="intro_result" style="max-width: 100%;">
      </figure>
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Abstract: a centered heading above a justified paragraph, in a column
           that takes four fifths of the container width. -->
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Abstract</h2>
          <div class="content has-text-justified">
            <p>
              In spite of recent advancements in text-to-image generation, it still has limitations when it comes to
              complex, imaginative text prompts. Due to the limited exposure to diverse and complex data in their
              training sets, text-to-image models often struggle to comprehend the semantics of these difficult prompts,
              leading to the generation of irrelevant images. This work explores how diffusion models can process and
              generate images based on prompts requiring artistic creativity or specialized knowledge. Recognizing the
              absence of a dedicated evaluation framework for such tasks, we introduce a new benchmark, the
              Realistic-Fantasy Benchmark (RFBench), which blends scenarios from both realistic and fantastical realms.
              Accordingly, for reality and fantasy scene generation, we propose an innovative training-free approach,
              Realistic-Fantasy Network (RFNet), that integrates diffusion models with LLMs. Through our proposed
              RFBench, extensive human evaluations coupled with GPT-based compositional assessments have demonstrated
              our approach's superiority over other state-of-the-art methods.
            </p>
          </div>
        </div>
      </div>
      <!-- End of abstract. -->
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Method: framework overview followed by the SAA module description.
           Each figure has its own id (ids must be unique within a document) and an
           alt text naming what the figure shows. The former height="100%" was
           dropped because the attribute only accepts a pixel integer. -->
      <div class="columns is-centered">
        <div class="column is-full-width">
          <h2 class="title is-3">Method</h2>

          <!-- Overview -->
          <h3 class="title is-4">Overview</h3>
          <div class="content has-text-justified">
            <p>
              The Realistic-Fantasy Network (RFNet) contains two stages.
              In the first stage, we transform the initial input prompt into a refined version specifically tailored for
              image generation by LLMs.
              In the second stage, we utilize a diffusion model through a two-step process to generate outputs with
              extraordinary details.
            </p>
            <img id="method" src="./static/images/framework.png" alt="Overview of the RFNet framework">
          </div>

          <!-- SAA Module -->
          <h3 class="title is-4">Semantic Alignment Assessment (SAA) Module</h3>
          <div class="content has-text-justified">
            <p>
              As we proceed with generating images using the diffusion model using the details generated by the previous
              step, there is a critical challenge: <i>the description lists generated by LLMs for one object usually
                overlook the relationships among them.</i> For example, interpretations of “a lion” could range from
              being “unaware and asleep” to “frightened and trying to escape.” Although both depictions are valid,
              descriptions such as “unaware” and “trying to escape” can lead to conflicting interpretations, thus
              complicating the image generation process.
            </p>
            <p>
              To overcome this challenge, we introduce the <b>Semantic Alignment Assessment (SAA)</b> module. This
              module calculates the relevance between different object vectors, thereby selecting the candidate
              description that best fits the current scenario. By conducting the cosine similarity among different
              descriptions, we can navigate the complexities introduced by the LLM's output, selecting the most
              compatible details for the diffusion model. This step is crucial for maintaining the coherence and
              accuracy of the generated images, highlighting our novel approach to mitigating the risk of conflicting
              descriptions. Through this module, we ensure textual precision and compatibility, and provide <i>clear,
                consistent instructions</i> for the subsequent diffusion model to generate visually coherent
              representations.
            </p>
            <img id="method-saa" src="./static/images/fig_SAA.jpg" alt="Semantic Alignment Assessment (SAA) module">
          </div>
        </div>
      </div>
      <!-- end Method. -->
    </div>
  </section>

  <section class="section">
    <div class="container is-max-desktop">
      <!-- Qualitative Result: three captioned comparison figures.
           Each image has a unique id (the original reused id="result" three times)
           and a short alt text; the full description lives in the figcaption.
           The former height="100%" was dropped because the attribute only accepts
           a pixel integer. -->
      <div class="columns is-centered">
        <div class="column is-full-width">
          <h2 class="title is-3">Qualitative Result</h2>
          <div class="image-container">
            <figure>
              <img id="result" src="./static/images/result.jpg" alt="Qualitative comparison on RFBench">
              <figcaption>Qualitative comparison on RFBench. The compared models include (a) Stable
                Diffusion, (b) MultiDiffusion, (c) Attend and Excite, (d) LMD, (e) BoxDiff, (f) SDXL,
                (g) Ours</figcaption>
            </figure>
            <figure>
              <img id="result-realistic-analytical" src="./static/images/more_fig1.png"
                alt="More results on Realistic and Analytical">
              <figcaption>More results on <b>Realistic and Analytical</b>. The compared models include (a)
                Stable Diffusion, (b) MultiDiffusion, (c) Attend and Excite, (d) LMD, (e) BoxDiff, (f)
                SDXL, (g) Ours
              </figcaption>
            </figure>
            <figure>
              <img id="result-creativity-imagination" src="./static/images/more_fig2.png"
                alt="More results on Creativity and Imagination">
              <figcaption>More results on <b>Creativity and Imagination</b>. The compared models include
                (a) Stable Diffusion, (b) MultiDiffusion, (c) Attend and Excite, (d) LMD, (e) BoxDiff,
                (f) SDXL, (g) Ours
              </figcaption>
            </figure>
          </div>
        </div>
      </div>

      <!-- Quantitative Result: per-model scores under two metrics (GPT4-CLIP and
           GPT4Score), each with R & A, C & I, and Avg columns. Header cells carry
           scope="col" so assistive technology can associate data cells with them.
           NOTE(review): the align attributes are obsolete but are kept on purpose;
           bulma.min.css (not visible here) may reset text alignment on cells
           without [align], so confirm against the stylesheet before removing them. -->
      <div class="columns is-centered">
        <div class="column is-full-width">
          <h2 class="title is-3">Quantitative Result</h2>
          <table>
            <thead>
              <tr>
                <th rowspan="2" scope="col" style="vertical-align: middle;" align="center">Model</th>
                <th colspan="3" scope="col" align="center">GPT4-CLIP</th>
                <th colspan="3" scope="col" align="center">GPT4Score</th>
              </tr>
              <tr align="center">
                <th scope="col">R &amp; A</th>
                <th scope="col">C &amp; I</th>
                <th scope="col">Avg</th>
                <th scope="col">R &amp; A</th>
                <th scope="col">C &amp; I</th>
                <th scope="col">Avg</th>
              </tr>
            </thead>
            <tbody align="center">
              <tr>
                <td>Stable Diffusion</td>
                <td>0.573</td>
                <td>0.552</td>
                <td>0.561</td>
                <td>0.667</td>
                <td>0.440</td>
                <td>0.541</td>
              </tr>
              <tr>
                <td>MultiDiffusion</td>
                <td>0.510</td>
                <td>0.510</td>
                <td>0.510</td>
                <td>0.517</td>
                <td>0.493</td>
                <td>0.504</td>
              </tr>
              <tr>
                <td>Attend and Excite</td>
                <td>0.523</td>
                <td>0.560</td>
                <td>0.546</td>
                <td>0.633</td>
                <td>0.520</td>
                <td>0.570</td>
              </tr>
              <tr>
                <td>LLM-grounded Diffusion</td>
                <td>0.457</td>
                <td>0.536</td>
                <td>0.501</td>
                <td>0.550</td>
                <td>0.600</td>
                <td>0.578</td>
              </tr>
              <tr>
                <td>BoxDiff</td>
                <td>0.532</td>
                <td>0.553</td>
                <td>0.543</td>
                <td>0.583</td>
                <td>0.520</td>
                <td>0.548</td>
              </tr>
              <tr>
                <td>SDXL</td>
                <td>0.536</td>
                <td>0.619</td>
                <td>0.582</td>
                <td>0.567</td>
                <td>0.587</td>
                <td>0.578</td>
              </tr>
              <!-- Proposed method: bold scores with the relative improvement in red. -->
              <tr class="highlight">
                <td><b>RFNet (ours)</b></td>
                <td><b>0.587 </b><span class="improvement">(2%↑)</span></td>
                <td><b>0.623 </b><span class="improvement">(13%↑)</span></td>
                <td><b>0.607 </b><span class="improvement">(8%↑)</span></td>
                <td><b>0.833 </b><span class="improvement">(25%↑)</span></td>
                <td><b>0.627 </b><span class="improvement">(43%↑)</span></td>
                <td><b>0.719 </b><span class="improvement">(33%↑)</span></td>
              </tr>
            </tbody>
          </table>
        </div>
      </div>

    </div>
  </section>

  <!-- Citation block. The entry sits inside <pre>, so its whitespace and line
       breaks are shown exactly as written; keep the formatting intact when editing. -->
  <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <pre><code>@article{yao2024fabricationrealityfantasyscene,
    title          = {The Fabrication of Reality and Fantasy: Scene Generation with LLM-Assisted Prompt Interpretation}, 
    author         = {Yi Yao and Chan-Feng Hsu and Jhe-Hao Lin and Hongxia Xie and Terence Lin and Yi-Ning Huang and Hong-Han Shuai and Wen-Huang Cheng},
    year           = {2024},
    eprint         = {2407.12579},
    archivePrefix  = {arXiv},
    primaryClass   = {cs.CV},
    url            = {https://arxiv.org/abs/2407.12579}, 
  }</code></pre>
    </div>
  </section>


  <footer class="footer">
    <!-- Attribution for the page template and its license.
         The commented-out icon links were removed: that comment also contained the
         opening tag of a wrapper div, which left an unmatched closing </div> at the
         end of the footer. The live markup below is otherwise unchanged, apart from
         the license link now using https. -->
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>,
            licensed under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </footer>

</body>

</html>