Spaces:

huawei-bayerlab
/

windowseat-reflection-removal-web

Running

App Files Files Community

windowseat-reflection-removal-web / index.html

anton-bayerlab

update index

2e72cd0 2 days ago

raw

history blame contribute delete

24.9 kB

	<!DOCTYPE html>
	<!-- Project page: Reflection Removal through Efficient Adaptation of Diffusion Transformers (WindowSeat). -->
	<html lang="en">
	<head>
	  <meta charset="utf-8">
	  <meta name="description" content="Reflection Removal through Efficient Adaptation of Diffusion Transformers">
	  <meta name="keywords" content="reflection removal, diffusion models, transformers, image processing, computer vision">
	  <meta name="viewport" content="width=device-width, initial-scale=1">

	  <!-- Twitter card metadata. -->
	  <meta name="twitter:card" content="summary">
	  <!-- NOTE(review): this card image is the Marigold project's logo, presumably left over from an earlier page; confirm and replace with a WindowSeat image. -->
	  <meta name="twitter:image:src" content="https://marigoldmonodepth.github.io/images/marigold_logo_square.jpg">
	  <meta name="twitter:title" content="WindowSeat">
	  <meta name="twitter:description" content="Reflection Removal through Efficient Adaptation of Diffusion Transformers">
	  <meta name="twitter:creator" content="@AntonObukhov1">

	  <title>Reflection Removal through Efficient Adaptation of Diffusion Transformers</title>

	<!-- Google tag (gtag.js) -->
	<script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
	<script>
	  // Reuse an existing dataLayer queue if one is already defined, otherwise start an empty one.
	  window.dataLayer = window.dataLayer || [];

	  // Pushes its raw `arguments` object onto the dataLayer queue.
	  function gtag() {
	    dataLayer.push(arguments);
	  }

	  gtag('js', new Date());
	  gtag('config', 'G-1FWSVCGZTG');
	</script>

	<!-- Web fonts: the families in the query string are separated by "|". -->
	<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">

	<!-- Local stylesheets and the page icon. -->
	<link rel="stylesheet" href="./css/bulma.min.css">
	<link rel="stylesheet" href="./css/bulma-carousel.min.css">
	<link rel="stylesheet" href="./css/bulma-slider.min.css">
	<link rel="stylesheet" href="./css/twentytwenty.css">
	<link rel="stylesheet" href="./css/index.css">
	<link rel="icon" href="./images/windowseat_icon.svg">
	<link rel="stylesheet" href="./css/comparison-widget.css?v=1">

	<!-- Local script files, loaded synchronously in the order listed. The inline script
	     later in this page calls $(...), bulmaCarousel.attach() and .twentytwenty(),
	     which presumably come from these files (confirm before reordering or deferring). -->
	<script src="./js/jquery-3.2.1.min.js"></script>
	<script src="./js/jquery.event.move.js"></script>
	<script src="./js/jquery.twentytwenty.js"></script>
	<script src="./js/bulma-carousel.min.js"></script>
	<script src="./js/bulma-slider.min.js"></script>
	<script src="./js/fontawesome.all.min.js"></script>
	<script src="./js/comparison-widget.js"></script>

	<!-- MathJax -->
	<script>
	  // Global configuration object; it is defined before the MathJax script below is loaded.
	  window.MathJax = {
	    tex: {
	      // Inline math may be delimited either by $...$ or by \(...\).
	      inlineMath: [['$', '$'], ['\\(', '\\)']]
	    },
	    // NOTE(review): the script loaded below is the CHTML build (tex-mml-chtml);
	    // this svg block presumably only applies to an SVG output build. Confirm.
	    svg: {
	      fontCache: 'global'
	    }
	  };
	</script>
	<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
	</head>
	<body>


	<!-- Title block: paper title, author list, affiliations and links to paper, code and model. -->
	<section class="hero">
	  <div class="hero-body">
	    <div class="container is-max-desktop">
	      <div class="columns is-centered">
	        <div class="column has-text-centered">
	          <h1 class="title is-1 publication-title">Reflection Removal through Efficient Adaptation of Diffusion Transformers</h1>

	          <!-- Authors. Superscripts refer to the affiliation and footnote markers listed below. -->
	          <div class="is-size-5 publication-authors">
	            <span class="author-block">
	              <a href="https://github.com/daniyarzt" target="_blank" rel="noopener noreferrer">Daniyar Zakarin</a><sup>*,1,2</sup>,</span>
	            <span class="author-block">
	              <!-- No profile URL is set for this author, so the name is plain text rather than
	                   a link with an empty href (which would reopen this page in a new tab). -->
	              Thiemo Wandel<sup>*,2</sup>,</span>
	            <span class="author-block">
	              <a href="https://www.obukhov.ai/" target="_blank" rel="noopener noreferrer">Anton Obukhov</a><sup>†,2</sup>,
	            </span>
	            <span class="author-block">
	              <a href="https://scholar.google.ch/citations?user=T51W57YAAAAJ&amp;hl=en" target="_blank" rel="noopener noreferrer">Dengxin Dai</a><sup>2</sup>
	            </span>
	          </div>

	          <!-- Affiliations and footnotes. -->
	          <div class="is-size-5 publication-authors">
	            <span class="author-block"><sup>1</sup>ETH Zurich</span>
	            <span class="author-block"><sup>2</sup>HUAWEI Bayer Lab</span>
	            <span class="author-block">*Equal contributors, internship work</span>
	            <span class="author-block">†Project lead</span>
	          </div>

	          <div class="column has-text-centered">
	            <div class="publication-links">
	              <!-- Paper -->
	              <span class="link-block">
	                <a href="https://arxiv.org/abs/2512.05000" target="_blank" rel="noopener noreferrer"
	                   class="external-link button is-normal is-rounded is-dark">
	                  <span class="icon">
	                    <i class="fas fa-file-pdf" style="color: orangered"></i>
	                  </span>
	                  <span>Paper</span>
	                </a>
	              </span>
	              <!-- Code -->
	              <span class="link-block">
	                <a href="https://github.com/huawei-bayerlab/windowseat-reflection-removal" target="_blank" rel="noopener noreferrer"
	                   class="external-link button is-normal is-rounded is-dark">
	                  <span class="icon">
	                    <i class="fab fa-github"></i>
	                  </span>
	                  <span>Code</span>
	                </a>
	              </span>
	              <!-- Model -->
	              <span class="link-block">
	                <a href="https://hf.co/huawei-bayerlab/windowseat-reflection-removal-v1-0" target="_blank" rel="noopener noreferrer"
	                   class="external-link button is-normal is-rounded is-dark">
	                  <span class="icon">🤗</span>
	                  <span>Model</span>
	                </a>
	              </span>
	            </div>
	          </div>
	        </div>
	      </div>
	    </div>
	  </div>
	</section>

	<!-- Teaser figure with a one-sentence summary of the method. -->
	<section class="hero teaser">
	  <div class="is-max-desktop">
	    <div class="hero-body">
	      <img id="teaser" width="100%" src="./images/windowseat_teaser.jpg" alt="WindowSeat teaser figure for single-image reflection removal">
	      <h2 class="subtitle has-text-centered mt-4">
	        We present <span class="methodname">WindowSeat</span>, a model and fine-tuning protocol for one-step single-image reflection removal.
	      </h2>
	    </div>
	  </div>
	</section>


	<!-- Abstract: one centered, four-fifths-width column holding the justified paper abstract. -->
	<section class="section pt-0">
	  <div class="container is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <h2 class="title is-3">Abstract</h2>
	        <div class="content has-text-justified">
	          <p>
	            We introduce a diffusion-transformer (DiT) framework for single-image reflection removal that leverages the generalization strengths of foundation diffusion models
	            in the restoration setting.
	            Rather than relying on task-specific architectures, we repurpose a pre-trained DiT-based foundation model by conditioning it on reflection-contaminated inputs and guiding it toward clean transmission layers.
	            We systematically analyze existing reflection removal data sources for diversity, scalability, and photorealism.
	            To address the shortage of suitable data, we construct a physically based rendering (PBR) pipeline in Blender, built around the Principled BSDF, to synthesize realistic glass materials and reflection effects.
	            Efficient LoRA-based adaptation of the foundation model, combined with the proposed synthetic data, achieves state-of-the-art performance on in-domain and zero-shot benchmarks.
	            These results demonstrate that pretrained diffusion transformers, when paired with physically grounded data synthesis and efficient adaptation, offer a scalable and high-fidelity solution for reflection removal.
	          </p>
	        </div>
	      </div>
	    </div>
	  </div>
	</section>


	<!-- Heading for the gallery; the carousels themselves live in the sections that follow. -->
	<section class="section pt-0 pb-4">
	  <div class="container is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <h2 class="title is-3">Gallery</h2>
	      </div>
	    </div>
	  </div>
	</section>

	<!-- Results carousel (#results-carousel-horizontal). Each slide pairs two images inside a
	     .twentytwenty-container-top: the first is the input, the second is the WindowSeat output. -->
	<section class="hero is-light is-small mt-4">
	  <div class="hero-body">
	    <div id="results-carousel-horizontal" class="carousel results-carousel">

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/car/original.jpg" alt="Car scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/car/windowseat.png" alt="Car scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/cafe/original.jpg" alt="Cafe scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/cafe/windowseat.png" alt="Cafe scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/entrance/original.jpg" alt="Entrance scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/entrance/windowseat.png" alt="Entrance scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/car_wheel/original.jpg" alt="Car wheel scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/car_wheel/windowseat.png" alt="Car wheel scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/bakery/original.jpg" alt="Bakery scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/bakery/windowseat.png" alt="Bakery scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/misty_train/original.jpg" alt="Misty train scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/misty_train/windowseat.png" alt="Misty train scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/wolf/original.jpg" alt="Wolf scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/wolf/windowseat.png" alt="Wolf scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/zoo/original.jpg" alt="Zoo scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/zoo/windowseat.png" alt="Zoo scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/airport/original.jpg" alt="Airport scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/airport/windowseat.png" alt="Airport scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/phone_booth/original.jpg" alt="Phone booth scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/phone_booth/windowseat.png" alt="Phone booth scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	    </div>
	  </div>
	</section>

	<!-- Results carousel (#results-carousel-vertical). Same slide structure as the other
	     results carousel: first image is the input, second is the WindowSeat output. -->
	<section class="hero is-light is-small">
	  <div class="hero-body pt-0">
	    <div id="results-carousel-vertical" class="carousel results-carousel">

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/uniqlo/original.jpg" alt="Uniqlo scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/uniqlo/windowseat.png" alt="Uniqlo scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/museum/original.jpg" alt="Museum scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/museum/windowseat.png" alt="Museum scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/store_front/original.jpg" alt="Store front scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/store_front/windowseat.png" alt="Store front scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/airplane_phone/original.png" alt="Airplane phone scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/airplane_phone/windowseat.png" alt="Airplane phone scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/park_cart/original.jpg" alt="Park cart scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/park_cart/windowseat.jpg" alt="Park cart scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	      <div class="twoitem">
	        <div class="twentytwenty-container twentytwenty-container-top">
	          <div class="cmpcontent">
	            <img src="./images/comparison/pharaoh/original.jpg" alt="Pharaoh scene: input image">
	          </div>
	          <div class="cmpcontent">
	            <img src="./images/comparison/pharaoh/windowseat.png" alt="Pharaoh scene: WindowSeat output">
	          </div>
	        </div>
	      </div>

	    </div>
	  </div>
	</section>

	<!-- Heading that introduces the qualitative baseline comparison. -->
	<section class="section pt-6 pb-4">
	  <div class=" is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <h2 class="title is-3">Qualitative Comparison with Other Recent Methods</h2>
	      </div>
	    </div>
	  </div>
	</section>

	<!-- Scene gallery plus the WindowSeat-vs-baseline comparison slider, on a light background. -->
	<section class="hero is-light is-small mt-4">
	  <div class="hero-body">
	    <div class="is-max-desktop">
	      <!-- Image gallery: each item names its scene in data-scene
	           (presumably read by comparison-widget.js; confirm). -->
	      <div class="ws-gallery-grid mb-5">
	        <div class="ws-gallery-item is-active" data-scene="car">
	          <img src="./images/comparison/car/original.jpg" alt="Car scene">
	        </div>
	        <div class="ws-gallery-item" data-scene="uniqlo">
	          <img src="./images/comparison/uniqlo/original.jpg" alt="Uniqlo scene">
	        </div>
	        <div class="ws-gallery-item" data-scene="entrance">
	          <img src="./images/comparison/entrance/original.jpg" alt="Entrance scene">
	        </div>
	        <div class="ws-gallery-item" data-scene="zoo">
	          <img src="./images/comparison/zoo/original.jpg" alt="Zoo scene">
	        </div>
	        <div class="ws-gallery-item" data-scene="bakery">
	          <img src="./images/comparison/bakery/original.jpg" alt="Bakery scene">
	        </div>
	        <div class="ws-gallery-item" data-scene="museum">
	          <img src="./images/comparison/museum/original.jpg" alt="Museum scene">
	        </div>

	        <!-- <div class="ws-gallery-item" data-scene="nature_pink_pole">
	        <img src="./images/comparison/nature_pink_pole/original.jpg" alt="Nature: 3_119" />
	        </div>
	        <div class="ws-gallery-item" data-scene="postcards_008">
	        <img src="./images/comparison/postcards_008/original.png" alt="Postcards: 008" />
	        </div>
	        <div class="ws-gallery-item" data-scene="postcards_050">
	        <img src="./images/comparison/postcards_050/original.png" alt="Postcards: 050 " />
	        </div>
	        <div class="ws-gallery-item" data-scene="nature_3_134">
	        <img src="./images/comparison/nature_3_134/original.jpg" alt="Nature: 3_134" />
	        </div>
	        <div class="ws-gallery-item" data-scene="real_110">
	        <img src="./images/comparison/real_110/original.jpg" alt="Real: 110" />
	        </div>
	        <div class="ws-gallery-item" data-scene="wild_026">
	        <img src="./images/comparison/wild_026/original.jpg" alt="Wild: 026" />
	        </div> -->
	      </div>

	      <!-- Comparison widget -->
	      <div class="columns is-variable is-2-desktop">
	        <!-- WindowSeat vs baseline comparison with slider -->
	        <div class="column">
	          <div class="ws-card">
	            <div class="ws-card-body ws-card-body-relative">
	              <div class="twentytwenty-container twentytwenty-container-top" id="ws-comparison-slider">
	                <div class="cmpcontent">
	                  <img id="ws-image-baseline" src="./images/comparison/car/dai.png" alt="Baseline output">
	                </div>
	                <div class="cmpcontent">
	                  <img id="ws-image-windowseat" src="./images/comparison/car/windowseat.png" alt="WindowSeat output">
	                </div>
	              </div>

	              <!-- Model selector for the baseline side of the comparison. The buttons only
	                   trigger in-page behaviour, so they are explicitly type="button". -->
	              <div class="ws-model-selector-overlay">
	                <button type="button" class="ws-model-pill is-active" data-model="dai">DAI</button>
	                <button type="button" class="ws-model-pill" data-model="dsit">DSIT</button>
	                <button type="button" class="ws-model-pill" data-model="dsrnet">DSRNet</button>
	                <button type="button" class="ws-model-pill" data-model="rdnet">RDNet</button>
	              </div>
	            </div>
	          </div>
	        </div>
	      </div>
	    </div>
	  </div>
	</section>

	<!-- "Our Approach": data-generation pipeline, fine-tuning protocol and quantitative results.
	     Text sits in .container wrappers; the figure wrappers use .is-max-desktop without .container. -->
	<section class="section">
	  <div class="container is-max-desktop">
	    <!-- Method. -->
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <h2 class="title is-3">Our Approach</h2>
	        <div class="content has-text-justified">
	          <h3 class="title has-text-centered">
	            Physically Based Rendering Pipeline for Data Generation
	          </h3>

	          <p>
	            Our method’s PBR pipeline generates realistic reflection-contaminated training data by simulating true light–glass interaction
	            inside a lightweight Blender setup. It uses the Principled BSDF to control a surface’s physical properties—such as index of refraction,
	            thickness, and roughness—allowing the system to reproduce ghosting, blur, scattering, and high-intensity highlights that simple alpha blending cannot capture.
	          </p>

	        </div>
	      </div>
	    </div>
	  </div>

	  <div class="is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <div class="content has-text-justified">
	          <img id="method_train" width="100%" src="./images/pbr_figure.jpg" alt="Physically based rendering pipeline for data generation">
	        </div>
	      </div>
	    </div>
	  </div>

	  <div class="container is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <div class="content has-text-justified">

	          <h3 class="title has-text-centered mt-4">
	            Fine-tuning Protocol for Modern Diffusion Models
	          </h3>

	          <p>
	            Our model repurposes a large diffusion transformer as a feed-forward reflection-removal network by operating directly
	            in the VAE's latent space and training only lightweight LoRA adapters. During training, the network receives the encoded
	            latent of a reflection-contaminated image and predicts a latent-space update that produces a clean transmission result
	            in a single step, avoiding multi-stage diffusion or auxiliary modules. High-quality PBR data is a key ingredient that
	            ensures that this fine-tuning protocol can be applied to future LDMs without major modifications.
	          </p>

	          <img id="method_inference" width="100%" src="./images/windowseat_training.png" alt="WindowSeat fine-tuning scheme">

	          <h3 class="title has-text-centered mt-0">
	            Quantitative Comparison with Other Recent Methods
	          </h3>

	          <p>
	            Our model consistently outperforms prior reflection-removal methods across both in-domain datasets and challenging zero-shot benchmarks.
	            It delivers higher PSNR and SSIM scores than existing approaches,
	            including diffusion-based, transformer-based, and dual-stream architectures,
	            and shows especially large gains on the SIR2 benchmarks, where it improves zero-shot PSNR by more than 1.5 dB and achieves the highest perceptual quality metrics (MS-SSIM and LPIPS).
	            Qualitative comparisons further show that it handles strong, complex, and high-frequency reflections with fewer artifacts,
	            while other methods often leave reflections partially intact or introduce distortions.
	            Overall, the model sets a new performance level while requiring a simpler architecture and more efficient training.
	          </p>

	        </div>
	      </div>
	    </div>
	  </div>

	  <div class="is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <div class="content has-text-justified">
	          <img id="comparison" width="100%" src="./images/both_result_tables.png" alt="Quantitative Comparison with Other Recent Methods">
	        </div>
	      </div>
	    </div>
	  </div>

	  <div class="container is-max-desktop">
	    <div class="columns is-centered has-text-centered">
	      <div class="column is-four-fifths">
	        <div class="content has-text-justified">

	          <p class="mt-4 mb-0 pb-0">
	            Refer to the PDF paper linked above for more details on qualitative, quantitative, and ablation studies.
	          </p>
	        </div>
	      </div>
	    </div>
	    <!--/ Method. -->
	  </div>
	</section>

	<script>
	  // Page initialisation, run on the window "load" event.
	  $(window).on('load', function () {
	    // Attach the two results carousels; they differ only in how many slides are shown at once.
	    bulmaCarousel.attach('#results-carousel-horizontal', {
	      slidesToScroll: 1,
	      slidesToShow: 3,
	      loop: true,
	      autoplay: true,
	    });

	    bulmaCarousel.attach('#results-carousel-vertical', {
	      slidesToScroll: 1,
	      slidesToShow: 5,
	      loop: true,
	      autoplay: true,
	    });

	    // Before/after sliders for the gallery. #ws-comparison-slider also carries the
	    // twentytwenty-container-top class, so it is excluded here: it is initialised
	    // separately below with its own labels and must not be set up twice.
	    $(".twentytwenty-container-top").not("#ws-comparison-slider").twentytwenty({
	      before_label: 'Input',
	      after_label: 'WindowSeat',
	      default_offset_pct: 0.4,
	    });

	    // Initialize comparison widget slider with different labels
	    $("#ws-comparison-slider").twentytwenty({
	      before_label: 'DAI',
	      after_label: 'WindowSeat',
	      default_offset_pct: 0.5,
	    });

	    // Set the carousels' max-height and make them visible once setup is done.
	    $('.results-carousel').css({
	      'max-height': '2000px',
	      'visibility': 'visible'
	    });
	  });
	</script>

	<!-- Citation: BibTeX entry for the paper. The entry sits inside <pre>, which preserves
	     whitespace, so its lines should be edited with care. -->
	<section class="section mt-0 pt-0" id="BibTeX">
	<div class="container is-max-desktop content">
	<h2 class="title">Citation</h2>
	<pre class="selectable">
	@misc{zakarin2025reflectionremovalefficientadaptation,
	title = {Reflection Removal through Efficient Adaptation of Diffusion Transformers},
	author = {Daniyar Zakarin and Thiemo Wandel and Anton Obukhov and Dengxin Dai},
	year = {2025},
	eprint = {2512.05000},
	archivePrefix= {arXiv},
	primaryClass = {cs.CV},
	url = {https://arxiv.org/abs/2512.05000},
	}
	</pre>
	</div>
	</section>

	<!-- Footer: template attribution and licence. -->
	<footer class="footer pt-4 pb-0">
	  <div class="container">
	    <div class="columns is-centered">
	      <div class="column is-8">
	        <div class="content">
	          <p>
	            Website template based on
	            <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>
	            and licensed under
	            <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA-4.0</a>.
	          </p>
	        </div>
	      </div>
	    </div>
	  </div>
	</footer>

	</body>
	</html>