Spaces: Core (Build error)
This view is limited to 50 files because it contains too many changes.
See raw diff
- __init__.py +0 -0
- app.py +102 -4
- assets/readmes/DATASET.md +42 -0
- assets/readmes/biomedparse_prediction_examples.png +0 -0
- assets/requirements/requirements_custom.txt +6 -0
- assets/scripts/eval.sh +21 -0
- assets/scripts/train.sh +41 -0
- biomedparse_working.txt +182 -0
- configs/biomed_seg_lang_v1.yaml +329 -0
- configs/biomedparse_inference.yaml +196 -0
- datasets/__init__.py +2 -0
- datasets/build.py +630 -0
- datasets/dataset_mappers/__init__.py +1 -0
- datasets/dataset_mappers/biomed_dataset_mapper.py +378 -0
- datasets/evaluation/__init__.py +8 -0
- datasets/evaluation/captioning_evaluation.py +129 -0
- datasets/evaluation/classification_evaluation.py +76 -0
- datasets/evaluation/grounding_evaluation.py +173 -0
- datasets/evaluation/instance_evaluation.py +107 -0
- datasets/evaluation/interactive_evaluation.py +122 -0
- datasets/evaluation/panoptic_evaluation.py +199 -0
- datasets/evaluation/retrieval_evaluation.py +260 -0
- datasets/evaluation/segmentation_evaluation.py +195 -0
- datasets/refer.py +371 -0
- datasets/registration/__init__.py +3 -0
- datasets/registration/register_biomed_datasets.py +123 -0
- datasets/semseg_loader.py +10 -0
- datasets/utils/refcoco2json.py +41 -0
- datasets/utils/refer.py +372 -0
- datasets/visual_sampler/__init__.py +12 -0
- datasets/visual_sampler/circle.py +106 -0
- datasets/visual_sampler/mask_generators.py +215 -0
- datasets/visual_sampler/point.py +74 -0
- datasets/visual_sampler/polygon.py +137 -0
- datasets/visual_sampler/sampler.py +77 -0
- datasets/visual_sampler/scribble.py +96 -0
- datasets/visual_sampler/simpleclick_sampler.py +252 -0
- docker/Dockerfile +32 -0
- docker/README.md +9 -0
- docker/data_env.sh +1 -0
- docker/docker_build.sh +1 -0
- docker/docker_run.sh +1 -0
- docker/setup_inside_docker.sh +10 -0
- entry.py +92 -0
- environment.yml +149 -0
- example_prediction.py +47 -0
- examples/144DME_as_F.jpeg +0 -0
- examples/C3_EndoCV2021_00462.jpg +0 -0
- examples/CT_lung_nodule.dcm +0 -0
- examples/LIDC-IDRI-0140_143_280_CT_lung.png +0 -0
__init__.py
ADDED
File without changes
app.py
CHANGED
@@ -1,7 +1,105 @@
+# Standard imports
+import torch
+import numpy as np
+from PIL import Image
+import matplotlib.pyplot as plt
 import gradio as gr
+import os

+# Hugging Face imports
+from huggingface_hub import hf_hub_download, login
+from google.colab import userdata

+# Local imports
+from modeling.BaseModel import BaseModel
+from modeling import build_model
+from utilities.distributed import init_distributed
+from utilities.arguments import load_opt_from_config_files
+from utilities.constants import BIOMED_CLASSES
+from inference_utils.inference import interactive_infer_image
+from inference_utils.output_processing import check_mask_stats
+from inference_utils.processing_utils import read_rgb, get_instances
+
+def init_huggingface():
+    """Initialize the Hugging Face connection and download the model checkpoint."""
+    login(userdata.get('HF_TOKEN'))
+    return hf_hub_download(
+        repo_id="microsoft/BiomedParse",
+        filename="biomedparse_v1.pt",
+        local_dir="pretrained"
+    )
+
+def setup_model():
+    """Configure and return the model."""
+    opt = load_opt_from_config_files(["configs/biomedparse_inference.yaml"])  # assumed config path; opt was otherwise undefined here
+    opt = init_distributed(opt)
+    model = BaseModel(opt, build_model(opt)).from_pretrained('hf_hub:microsoft/BiomedParse').eval().cuda()
+
+    with torch.no_grad():
+        model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(
+            BIOMED_CLASSES + ["background"],
+            is_eval=True
+        )
+    return model
+
+def process_image(image, prompts, model):
+    """Run inference on the image with the given prompts."""
+    if isinstance(image, str):
+        image = Image.open(image)
+    else:
+        image = Image.fromarray(image)
+
+    prompts = [p.strip() for p in prompts.split(',')]
+
+    pred_masks = interactive_infer_image(model, image, prompts)
+
+    fig = plt.figure(figsize=(10, 5))
+    plt.subplot(1, len(pred_masks) + 1, 1)
+    plt.imshow(image)
+    plt.title('Image originale')
+    plt.axis('off')
+
+    for i, mask in enumerate(pred_masks):
+        plt.subplot(1, len(pred_masks) + 1, i+2)
+        plt.imshow(image)
+        plt.imshow(mask, alpha=0.5, cmap='Reds')
+        plt.title(prompts[i])
+        plt.axis('off')
+
+    return fig
+
+def setup_gradio_interface(model):
+    """Configure the Gradio interface."""
+    return gr.Interface(
+        theme=gr.Theme.from_hub("allenai/gradio-theme"),
+        fn=lambda img, txt: process_image(img, txt, model),
+        inputs=[
+            gr.Image(type="numpy", label="Image médicale"),
+            gr.Textbox(
+                label="Prompts (séparés par des virgules)",
+                placeholder="edema, lesion, etc...",
+                elem_classes="white"
+            )
+        ],
+        outputs=gr.Plot(),
+        title="Core IA - Traitement d'image medicale",
+        description="Chargez une image médicale et spécifiez les éléments à segmenter",
+        examples=[
+            ["examples/144DME_as_F.jpeg", "Dans cette image donne moi l'œdème"],
+            ["examples/ISIC_0015551.jpg", "Cherche une lésion"],
+            ["examples/T0011.jpg", "disque optique, cupule optique"],
+            ["examples/C3_EndoCV2021_00462.jpg", "Trouve moi le polyp"],
+            ["examples/covid_1585.png", "Qu'est ce qui ne va pas ici ?"],
+            ['examples/Part_1_516_pathology_breast.png', "cellules néoplasiques , cellules inflammatoires , cellules du tissu conjonctif"]
+        ]
+    )
+
+def main():
+    """Main entry point of the application."""
+    init_huggingface()
+    model = setup_model()
+    interface = setup_gradio_interface(model)
+    interface.launch(debug=True)
+
+if __name__ == "__main__":
+    main()
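For reference, a rough sketch of how the functions defined in app.py could be exercised outside the Gradio UI, e.g. in a Colab notebook cell. This is not part of the commit: the `from google.colab import userdata` line means app.py as written only imports cleanly on Colab, and the example image path is simply one of the files in the examples/ folder added by this commit; everything else assumes the BiomedParse dependencies are installed and a CUDA GPU is available.

# Hedged sketch, not part of the commit: run process_image() without launching Gradio.
from app import init_huggingface, setup_model, process_image  # imports google.colab, so Colab-only as written

init_huggingface()                  # logs in with HF_TOKEN and downloads biomedparse_v1.pt into pretrained/
model = setup_model()

fig = process_image("examples/C3_EndoCV2021_00462.jpg", "polyp", model)
fig.savefig("polyp_prediction.png")  # one panel per prompt, predicted mask overlaid in red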
assets/readmes/DATASET.md
ADDED
@@ -0,0 +1,42 @@
+# **BiomedParseData**
+
+BiomedParseData was created from preprocessing publicly available biomedical image segmentation datasets.
+
+These datasets are provided pre-formatted for convenience. For additional information about the datasets or their licenses, please reach out to the owners:
+| Dataset | URL |
+|---------------------------------------|-----|
+| amos22 | [https://amos22.grand-challenge.org/](https://amos22.grand-challenge.org/) |
+| MSD (Medical Segmentation Decathlon) | [http://medicaldecathlon.com/](http://medicaldecathlon.com/) |
+| KiTS23 | [https://github.com/neheller/kits23](https://github.com/neheller/kits23) |
+| BTCV | [https://www.synapse.org/#!Synapse:syn3193805/wiki/217790](https://www.synapse.org/#!Synapse:syn3193805/wiki/217790) |
+| COVID-19 CT | [https://www.kaggle.com/datasets/andrewmvd/covid19-ct-scans](https://www.kaggle.com/datasets/andrewmvd/covid19-ct-scans) |
+| LIDC-IDRI | [https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI](https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI) |
+| ACDC | [https://www.creatis.insa-lyon.fr/Challenge/acdc/databases.html](https://www.creatis.insa-lyon.fr/Challenge/acdc/databases.html) |
+| M&Ms | [https://www.ub.edu/mnms/](https://www.ub.edu/mnms/) |
+| PROMISE12 | [cite https://doi.org/10.1016/j.media.2013.12.002](https://doi.org/10.1016/j.media.2013.12.002) |
+| LGG | [https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation](https://www.kaggle.com/datasets/mateuszbuda/lgg-mri-segmentation) |
+| COVID-QU-Ex | [https://www.kaggle.com/datasets/anasmohammedtahir/covidqu](https://www.kaggle.com/datasets/anasmohammedtahir/covidqu) |
+| QaTa-COV19 | [https://www.kaggle.com/datasets/aysendegerli/qatacov19-dataset](https://www.kaggle.com/datasets/aysendegerli/qatacov19-dataset) |
+| SIIM-ACR Pneumothorax Segmentation | [https://www.kaggle.com/datasets/vbookshelf/pneumothorax-chest-xray-images-and-masks](https://www.kaggle.com/datasets/vbookshelf/pneumothorax-chest-xray-images-and-masks) |
+| Chest Xray Masks and Labels Dataset | [https://datasetninja.com/chest-xray](https://datasetninja.com/chest-xray) |
+| COVID-19 Radiography Database | [https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database](https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database) |
+| CAMUS | [https://www.creatis.insa-lyon.fr/Challenge/camus/index.html](https://www.creatis.insa-lyon.fr/Challenge/camus/index.html) |
+| BUSI | [https://scholar.cu.edu.eg/?q=afahmy/pages/dataset](https://scholar.cu.edu.eg/?q=afahmy/pages/dataset) |
+| FH-PS-AOP | [https://zenodo.org/records/7851339#.ZEH6eHZBztU](https://zenodo.org/records/7851339#.ZEH6eHZBztU) |
+| CDD-CESM | [https://www.cancerimagingarchive.net/collection/cdd-cesm/](https://www.cancerimagingarchive.net/collection/cdd-cesm/) |
+| PolypGen | [https://www.synapse.org/#!Synapse:syn26376615/wiki/613312](https://www.synapse.org/#!Synapse:syn26376615/wiki/613312) |
+| NeoPolyp | [https://www.kaggle.com/c/bkai-igh-neopolyp/data](https://www.kaggle.com/c/bkai-igh-neopolyp/data) |
+| ISIC 2018 | [https://challenge2018.isic-archive.com/task1/](https://challenge2018.isic-archive.com/task1/) |
+| UwaterlooSkinCancer | [Skin Cancer Detection \| Vision and Image Processing Lab \| University of Waterloo](https://uwaterloo.ca) |
+| OCT-CME | [https://www.kaggle.com/datasets/zeeshanahmed13/intraretinal-cystoid-fluid](https://www.kaggle.com/datasets/zeeshanahmed13/intraretinal-cystoid-fluid) |
+| REFUGE | [https://bitbucket.org/woalsdnd/refuge/src](https://bitbucket.org/woalsdnd/refuge/src) |
+| G1020 | [https://www.dfki.uni-kl.de/g1020](https://www.dfki.uni-kl.de/g1020) |
+| DRIVE | [https://drive.grand-challenge.org/](https://drive.grand-challenge.org/) |
+| GlaS | [https://warwick.ac.uk/fac/cross_fac/tia/data/glascontest/](https://warwick.ac.uk/fac/cross_fac/tia/data/glascontest/) |
+| PanNuke | [https://jgamper.github.io/PanNukeDataset/](https://jgamper.github.io/PanNukeDataset/) |
+| FUMPE | [https://figshare.com/collections/FUMPE/4107803/1](https://figshare.com/collections/FUMPE/4107803/1) |
+| TotalSegmentator | [https://github.com/wasserth/TotalSegmentator](https://github.com/wasserth/TotalSegmentator) |
+| BraTS2023 | [https://www.synapse.org/#!Synapse:syn51156910/wiki/621282](https://www.synapse.org/#!Synapse:syn51156910/wiki/621282) |
+| AbdomenCT-1K | [https://github.com/JunMa11/AbdomenCT-1K](https://github.com/JunMa11/AbdomenCT-1K) |
+| US Simulation & Segmentation | [https://www.kaggle.com/datasets/ignaciorlando/ussimandsegm](https://www.kaggle.com/datasets/ignaciorlando/ussimandsegm) |
+| CDD-CESM | [https://www.cancerimagingarchive.net/collection/cdd-cesm/](https://www.cancerimagingarchive.net/collection/cdd-cesm/) |
assets/readmes/biomedparse_prediction_examples.png
ADDED
assets/requirements/requirements_custom.txt
ADDED
@@ -0,0 +1,6 @@
+git+https://github.com/cocodataset/panopticapi.git
+git+https://github.com/openai/CLIP.git
+#git+https://github.com/arogozhnikov/einops.git
+#git+https://github.com/facebookresearch/detectron2.git
+git+https://github.com/MaureenZOU/detectron2-xyz.git
+#git+https://github.com/openai/whisper.git
assets/scripts/eval.sh
ADDED
@@ -0,0 +1,21 @@
+export DETECTRON2_DATASETS=biomedparse_datasets/
+export DATASET=biomedparse_datasets/
+export DATASET2=biomedparse_datasets/
+export VLDATASET=biomedparse_datasets/
+export PATH=$PATH:biomedparse_datasets/coco_caption/jre1.8.0_321/bin/
+export PYTHONPATH=$PYTHONPATH:biomedparse_datasets/coco_caption/
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+#export WANDB_KEY=YOUR_WANDB_KEY # Provide your wandb key here
+CUDA_VISIBLE_DEVICES=0 mpirun -n 1 python entry.py evaluate \
+            --conf_files configs/biomed_seg_lang_v1.yaml \
+            --overrides \
+            MODEL.DECODER.HIDDEN_DIM 512 \
+            MODEL.ENCODER.CONVS_DIM 512 \
+            MODEL.ENCODER.MASK_DIM 512 \
+            TEST.BATCH_SIZE_TOTAL 1 \
+            FP16 True \
+            WEIGHT True \
+            STANDARD_TEXT_FOR_EVAL False \
+            RESUME_FROM pretrained/biomedparse_v1.pt \
+
assets/scripts/train.sh
ADDED
@@ -0,0 +1,41 @@
+export DETECTRON2_DATASETS=biomedparse_datasets/
+export DATASET=biomedparse_datasets/
+export DATASET2=biomedparse_datasets/
+export VLDATASET=biomedparse_datasets/
+export PATH=$PATH:biomedparse_datasets/coco_caption/jre1.8.0_321/bin/
+export PYTHONPATH=$PYTHONPATH:biomedparse_datasets/coco_caption/
+export OMPI_ALLOW_RUN_AS_ROOT=1
+export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1
+#export WANDB_KEY=YOUR_WANDB_KEY # Provide your wandb key here
+CUDA_VISIBLE_DEVICES=0 mpirun -n 1 python entry.py train \
+            --conf_files configs/biomed_seg_lang_v1.yaml \
+            --overrides \
+            FP16 True \
+            RANDOM_SEED 2024 \
+            BioMed.INPUT.IMAGE_SIZE 1024 \
+            MODEL.DECODER.HIDDEN_DIM 512 \
+            MODEL.ENCODER.CONVS_DIM 512 \
+            MODEL.ENCODER.MASK_DIM 512 \
+            TEST.BATCH_SIZE_TOTAL 4 \
+            TRAIN.BATCH_SIZE_TOTAL 4 \
+            TRAIN.BATCH_SIZE_PER_GPU 4 \
+            SOLVER.MAX_NUM_EPOCHS 20 \
+            SOLVER.BASE_LR 0.00001 \
+            SOLVER.FIX_PARAM.backbone False \
+            SOLVER.FIX_PARAM.lang_encoder False \
+            SOLVER.FIX_PARAM.pixel_decoder False \
+            MODEL.DECODER.COST_SPATIAL.CLASS_WEIGHT 1.0 \
+            MODEL.DECODER.COST_SPATIAL.MASK_WEIGHT 1.0 \
+            MODEL.DECODER.COST_SPATIAL.DICE_WEIGHT 1.0 \
+            MODEL.DECODER.TOP_SPATIAL_LAYERS 10 \
+            MODEL.DECODER.SPATIAL.ENABLED True \
+            MODEL.DECODER.GROUNDING.ENABLED True \
+            LOADER.SAMPLE_PROB prop \
+            BioMed.INPUT.RANDOM_ROTATE True \
+            FIND_UNUSED_PARAMETERS True \
+            ATTENTION_ARCH.SPATIAL_MEMORIES 32 \
+            MODEL.DECODER.SPATIAL.MAX_ITER 0 \
+            ATTENTION_ARCH.QUERY_NUMBER 3 \
+            STROKE_SAMPLER.MAX_CANDIDATE 10 \
+            WEIGHT True \
+            RESUME_FROM pretrained/biomedparse_v1.pt
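The `--overrides` arguments in eval.sh and train.sh above are flat `KEY.PATH value` pairs applied on top of the YAML config passed via `--conf_files`. As a rough illustration only (the actual parsing lives in entry.py and the utilities package and may differ in details such as type handling), here is the kind of mapping such pairs perform on the nested config:

# Hedged illustration: how dotted override pairs could be applied to the nested YAML config.
import yaml

def apply_override(cfg, dotted_key, value):
    """Walk the dotted path and set the leaf value."""
    keys = dotted_key.split(".")
    node = cfg
    for k in keys[:-1]:
        node = node.setdefault(k, {})
    node[keys[-1]] = yaml.safe_load(value)  # turns "0.00001"/"True"/"prop" into float/bool/str

with open("configs/biomed_seg_lang_v1.yaml") as f:
    cfg = yaml.safe_load(f)

apply_override(cfg, "SOLVER.BASE_LR", "0.00001")
apply_override(cfg, "TRAIN.BATCH_SIZE_PER_GPU", "4")
print(cfg["SOLVER"]["BASE_LR"], cfg["TRAIN"]["BATCH_SIZE_PER_GPU"])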
biomedparse_working.txt
ADDED
@@ -0,0 +1,182 @@
+name: biomedparse
+channels:
+  - pytorch
+  - nvidia
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - blas=1.0=mkl
+  - brotli-python=1.0.9=py312h6a678d5_8
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2024.7.2=h06a4308_0
+  - certifi=2024.7.4=py312h06a4308_0
+  - charset-normalizer=3.3.2=pyhd3eb1b0_0
+  - cuda-cudart=12.4.127=0
+  - cuda-cupti=12.4.127=0
+  - cuda-libraries=12.4.0=0
+  - cuda-nvrtc=12.4.127=0
+  - cuda-nvtx=12.4.127=0
+  - cuda-opencl=12.6.68=0
+  - cuda-runtime=12.4.0=0
+  - cuda-version=12.6=3
+  - expat=2.6.2=h6a678d5_0
+  - ffmpeg=4.3=hf484d3e_0
+  - filelock=3.13.1=py312h06a4308_0
+  - freetype=2.12.1=h4a9f257_0
+  - gmp=6.2.1=h295c915_3
+  - gnutls=3.6.15=he1e5248_0
+  - idna=3.7=py312h06a4308_0
+  - intel-openmp=2023.1.0=hdb19cb5_46306
+  - jinja2=3.1.4=py312h06a4308_0
+  - jpeg=9e=h5eee18b_3
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libcublas=12.4.2.65=0
+  - libcufft=11.2.0.44=0
+  - libcufile=1.11.1.6=0
+  - libcurand=10.3.7.68=0
+  - libcusolver=11.6.0.99=0
+  - libcusparse=12.3.0.142=0
+  - libdeflate=1.17=h5eee18b_1
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h5eee18b_3
+  - libidn2=2.3.4=h5eee18b_0
+  - libjpeg-turbo=2.0.0=h9bf148f_0
+  - libnpp=12.2.5.2=0
+  - libnvfatbin=12.6.68=0
+  - libnvjitlink=12.4.99=0
+  - libnvjpeg=12.3.1.89=0
+  - libpng=1.6.39=h5eee18b_0
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.19.0=h5eee18b_0
+  - libtiff=4.5.1=h6a678d5_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.41.5=h5eee18b_0
+  - libwebp-base=1.3.2=h5eee18b_0
+  - llvm-openmp=14.0.6=h9e868ea_0
+  - lz4-c=1.9.4=h6a678d5_1
+  - markupsafe=2.1.3=py312h5eee18b_0
+  - mkl=2023.1.0=h213fc3f_46344
+  - mkl-service=2.4.0=py312h5eee18b_1
+  - mkl_fft=1.3.10=py312h5eee18b_0
+  - mkl_random=1.2.7=py312h526ad5a_0
+  - mpmath=1.3.0=py312h06a4308_0
+  - ncurses=6.4=h6a678d5_0
+  - nettle=3.7.3=hbbd107a_1
+  - networkx=3.2.1=py312h06a4308_0
+  - openh264=2.1.1=h4ff587b_0
+  - openjpeg=2.5.2=he7f1fd0_0
+  - openssl=3.0.14=h5eee18b_0
+  - pip=24.2=py312h06a4308_0
+  - pysocks=1.7.1=py312h06a4308_0
+  - python=3.12.4=h5148396_1
+  - pytorch=2.4.1=py3.12_cuda12.4_cudnn9.1.0_0
+  - pytorch-cuda=12.4=hc786d27_6
+  - pytorch-mutex=1.0=cuda
+  - pyyaml=6.0.1=py312h5eee18b_0
+  - readline=8.2=h5eee18b_0
+  - requests=2.32.3=py312h06a4308_0
+  - setuptools=72.1.0=py312h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - sympy=1.13.2=py312h06a4308_0
+  - tbb=2021.8.0=hdb19cb5_0
+  - tk=8.6.14=h39e8969_0
+  - torchaudio=2.4.1=py312_cu124
+  - torchtriton=3.0.0=py312
+  - torchvision=0.19.1=py312_cu124
+  - typing_extensions=4.11.0=py312h06a4308_0
+  - urllib3=2.2.2=py312h06a4308_0
+  - wheel=0.43.0=py312h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - yaml=0.2.5=h7b6447c_0
+  - zlib=1.2.13=h5eee18b_1
+  - zstd=1.5.5=hc292b87_2
+  - pip:
+    - absl-py==2.1.0
+    - accelerate==0.23.0
+    - antlr4-python3-runtime==4.9.3
+    - appdirs==1.4.4
+    - black==21.4b2
+    - click==8.1.7
+    - clip==1.0
+    - cloudpickle==3.0.0
+    - contourpy==1.3.0
+    - cycler==0.12.1
+    - cython==3.0.2
+    - deepspeed==0.10.3
+    - detectron2==0.6
+    - diffdist==0.1
+    - einops==0.7.0
+    - fonttools==4.53.1
+    - fsspec==2024.9.0
+    - ftfy==6.1.1
+    - future==1.0.0
+    - fvcore==0.1.5.post20221221
+    - grpcio==1.66.1
+    - hjson==3.1.0
+    - huggingface-hub==0.17.3
+    - hydra-core==1.3.2
+    - imageio==2.35.1
+    - infinibatch==0.1.1
+    - iopath==0.1.9
+    - joblib==1.4.2
+    - json-tricks==3.17.3
+    - kiwisolver==1.4.7
+    - kornia==0.7.0
+    - lazy-loader==0.4
+    - markdown==3.7
+    - matplotlib==3.9.2
+    - mup==1.0.0
+    - mypy-extensions==1.0.0
+    - ninja==1.11.1.1
+    - nltk==3.8.1
+    - numpy==1.26.4
+    - omegaconf==2.3.0
+    - opencv-python==4.8.1.78
+    - packaging==24.1
+    - pandas==2.0.3
+    - panopticapi==0.1
+    - pathspec==0.12.1
+    - pillow==9.4.0
+    - portalocker==2.10.1
+    - protobuf==5.28.0
+    - psutil==6.0.0
+    - py-cpuinfo==9.0.0
+    - pycocotools==2.0.7
+    - pydantic==1.10.18
+    - pydot==3.0.1
+    - pyparsing==3.1.4
+    - python-dateutil==2.9.0.post0
+    - pytz==2024.1
+    - pywavelets==1.7.0
+    - regex==2023.10.3
+    - safetensors==0.4.4
+    - scikit-image==0.21.0
+    - scikit-learn==1.3.1
+    - scipy==1.14.1
+    - seaborn==0.13.2
+    - sentencepiece==0.1.99
+    - six==1.16.0
+    - tabulate==0.9.0
+    - tenacity==9.0.0
+    - tensorboard==2.17.1
+    - tensorboard-data-server==0.7.2
+    - termcolor==2.4.0
+    - threadpoolctl==3.5.0
+    - tifffile==2024.8.30
+    - timm==0.4.12
+    - tokenizers==0.14.1
+    - toml==0.10.2
+    - tqdm==4.66.5
+    - transformers==4.34.0
+    - tzdata==2024.1
+    - vision-datasets==0.2.2
+    - wcwidth==0.2.13
+    - werkzeug==3.0.4
+    - yacs==0.1.8
+prefix: /anaconda/envs/biomedparse
configs/biomed_seg_lang_v1.yaml
ADDED
@@ -0,0 +1,329 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xueyan Zou ([email protected])
+# --------------------------------------------------------
+
+# Define Test/Trainer/Saving
+PIPELINE: XDecoderPipeline
+TRAINER: xdecoder
+SAVE_DIR: './output'
+base_path: "./"
+
+# Resume Logistic
+RESUME: false
+WEIGHT: false
+RESUME_FROM: ''
+EVAL_AT_START: false
+SAVE_CHECKPOINT: True
+
+# Logging and Debug
+WANDB: False
+LOG_EVERY: 100
+FIND_UNUSED_PARAMETERS: false
+
+# Speed up training
+FP16: false
+PORT: '36873'
+
+# misc
+LOADER:
+  JOINT: True
+  KEY_DATASET: ""
+  SAMPLE_PROB: "prop" # sampling probability proportional to data size. Use "equal" for each bach from all datasets
+  MIXING_LEVEL: 1 # num of different datasets for batch mixing on each GPU
+
+RANDOM_SEED: 2024
+
+STANDARD_TEXT_FOR_EVAL: False
+
+##################
+# Task settings
+##################
+VERBOSE: true
+MODEL:
+  NAME: seem_model_v1
+  HEAD: xdecoder_head
+  MASK_ON: false
+  KEYPOINT_ON: false
+  LOAD_PROPOSALS: false
+  DIM_PROJ: 512
+  TEXT:
+    ARCH: vlpencoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 77 #256 # 77
+    WIDTH: 512 # 768 # 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: focal # focal_dw # focal
+    PRETRAINED: ''
+    LOAD_PRETRAINED: false
+    FOCAL:
+      PRETRAIN_IMG_SIZE: 224
+      PATCH_SIZE: 4
+      EMBED_DIM: 192 # 96 # 192
+      DEPTHS: [2, 2, 18, 2] # [2, 2, 6, 2] # [2, 2, 18, 2]
+      FOCAL_LEVELS: [4, 4, 4, 4] # [3, 3, 3, 3] # [4, 4, 4, 4]
+      FOCAL_WINDOWS: [3, 3, 3, 3]
+      DROP_PATH_RATE: 0.3
+      MLP_RATIO: 4.0
+      DROP_RATE: 0.0
+      PATCH_NORM: True
+      USE_CONV_EMBED: True
+      SCALING_MODULATOR: True
+      USE_CHECKPOINT: False
+      USE_POSTLN: true
+      USE_POSTLN_IN_MODULATION: false
+      USE_LAYERSCALE: True
+      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+      OUT_INDICES: [0, 1, 2, 3]
+  ENCODER:
+    NAME: transformer_encoder_fpn
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 16
+    BINARY_CLASSES: False
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 512
+    MASK_DIM: 512
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  DECODER:
+    NAME: seem_v1
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK:
+      ENABLED: True
+    DETECTION: False
+    SPATIAL:
+      ENABLED: True
+      MAX_ITER: 1
+    GROUNDING:
+      ENABLED: True
+      MAX_LEN: 10
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    RETRIEVAL:
+      ENABLED: False
+    LVIS:
+      ENABLED: False
+      THRES: 0.7
+    OPENIMAGE:
+      ENABLED: False
+      NEGATIVE_SAMPLES: 5
+      GROUNDING:
+        ENABLED: False
+        MAX_LEN: 5
+    CAPTION:
+      ENABLED: False
+      PHRASE_PROB: 0.5
+      SIM_THRES: 0.95
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    GCLASS_WEIGHT: 0.4
+    GMASK_WEIGHT: 1.0
+    GDICE_WEIGHT: 1.0
+    SCLASS_WEIGHT: 0.4
+    SMASK_WEIGHT: 1.0
+    SDICE_WEIGHT: 1.0
+    OCLASS_WEIGHT: 0.4
+    OMASK_WEIGHT: 1.0
+    ODICE_WEIGHT: 1.0
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    BBOX_WEIGHT: 5.0
+    GIOU_WEIGHT: 2.0
+    CAPTION_WEIGHT: 2.0
+    COST_SPATIAL:
+      CLASS_WEIGHT: 5.0
+      MASK_WEIGHT: 2.0
+      DICE_WEIGHT: 2.0
+    HIDDEN_DIM: 512
+    NUM_OBJECT_QUERIES: 101
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    MAX_SPATIAL_LEN: [512, 512, 512, 512]
+    # ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+    TOP_GROUNDING_LAYERS: 10
+    TOP_CAPTION_LAYERS: 10
+    TOP_SPATIAL_LAYERS: 10
+    TOP_OPENIMAGE_LAYERS: 10
+    TEST:
+      SEMANTIC_ON: False
+      INSTANCE_ON: False
+      PANOPTIC_ON: False
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.8
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: true
+
+# Spatial sampler
+STROKE_SAMPLER:
+  MAX_CANDIDATE: 1
+  CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
+  CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
+  DILATION: 3
+  CIRCLE:
+    NUM_STROKES: 5
+    STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
+    STROKE_PROB: [0.33, 0.33, 0.33]
+  SCRIBBLE:
+    NUM_STROKES: 5
+    STROKE_PRESET: ['rand_curve', 'rand_curve_small']
+    STROKE_PROB: [0.5, 0.5]
+  POINT:
+    NUM_POINTS: 20
+  POLYGON:
+    MAX_POINTS: 9
+  EVAL:
+    MODE: 'best' # best/random/best_random
+    NEGATIVE: False
+    MAX_ITER: 1
+    IOU_ITER: 1
+    GROUNDING: True
+
+# Multi-modal Architecture, order matters
+ATTENTION_ARCH:
+  VARIABLE:
+    queries: ['object', 'grounding', 'spatial']
+    tokens: ['grounding', 'spatial']
+    memories: ['spatial']
+  SELF_ATTENTION:
+    queries:
+      object: ['queries_object']
+      grounding: ['queries_grounding', 'tokens_grounding']
+      spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
+    tokens:
+      grounding: ['queries_grounding', 'tokens_grounding']
+      spatial: ['tokens_spatial']
+    memories:
+      spatial: ['memories_spatial']
+  CROSS_ATTENTION:
+    queries:
+      object: True
+      grounding: True
+      spatial: True
+    memories:
+      spatial: True
+    tokens:
+      grounding: False
+      spatial: False
+  MASKING: ['tokens_spatial', 'tokens_grounding']
+  DUPLICATION:
+    queries:
+      grounding: 'queries_object'
+      spatial: 'queries_object'
+  SPATIAL_MEMORIES: 32
+  QUERY_NUMBER: 3
+
+DATASETS:
+  TRAIN: [
+    'biomed_BiomedParseData-Demo_demo' # Add your registered training datasets here
+  ]
+
+
+  TEST: [
+    'biomed_BiomedParseData-Demo_demo' # Add your registered test datasets here
+  ]
+
+  CLASS_CONCAT: false
+  SIZE_DIVISIBILITY: 32
+  PROPOSAL_FILES_TRAIN: []
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+
+TRAIN:
+  ASPECT_RATIO_GROUPING: true
+  BATCH_SIZE_TOTAL: 4
+  BATCH_SIZE_PER_GPU: 4
+  SHUFFLE: true
+
+TEST:
+  DETECTIONS_PER_IMAGE: 100
+  NAME: coco_eval
+  IOU_TYPE: ['bbox', 'segm']
+  USE_MULTISCALE: false
+  BATCH_SIZE_TOTAL: 4
+  MODEL_FILE: ''
+  AUG:
+    ENABLED: False
+
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: False
+  NUM_WORKERS: 8
+  LOAD_PROPOSALS: False
+  SAMPLER_TRAIN: "TrainingSampler"
+  ASPECT_RATIO_GROUPING: True
+
+
+BioMed:
+  INPUT:
+    PIXEL_MEAN: [64.284, 59.293, 59.962]
+    PIXEL_STD: [62.484, 60.865, 59.835]
+    DATASET_MAPPER_NAME: "biomed_interactive"
+    MIN_SIZE_TRAIN: 900
+    MAX_SIZE_TRAIN: 1100
+    MIN_SIZE_TRAIN_SAMPLING: 'choice'
+    MIN_SIZE_TEST: 900
+    MAX_SIZE_TEST: 1100
+    IMAGE_SIZE: 1024
+    MIN_SCALE: 0.9
+    MAX_SCALE: 1.1
+    IGNORE_VALUE: 255
+    COLOR_AUG_SSD: False
+    SIZE_DIVISIBILITY: 32
+    RANDOM_FLIP: "none"
+    RANDOM_ROTATE: False
+    MASK_FORMAT: "polygon"
+    MIN_AREA: 30
+    FORMAT: "RGB"
+    SPATIAL: True
+    CROP:
+      ENABLED: True
+  DATASET:
+    DATASET: "biomed"
+
+
+# Detectron2 training config for optimizer and lr scheduler
+SOLVER:
+  BASE_LR: 0.0001
+  STEPS: [0.88889, 0.96296]
+  MAX_ITER: 1
+  GAMMA: 0.1
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WARMUP_METHOD: "linear"
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
+  LR_MULTIPLIER:
+    backbone: 0.1
+    lang_encoder: 0.1
+  FIX_PARAM:
+    backbone: True
+    lang_encoder: True
+    pixel_decoder: True
+  WEIGHT_DECAY_NORM: 0.0
+  WEIGHT_DECAY_EMBED: 0.0
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 5.0 # 0.01
+    NORM_TYPE: 2.0
+  MAX_NUM_EPOCHS: 50
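The names under DATASETS.TRAIN and DATASETS.TEST in this config must match datasets registered with detectron2's DatasetCatalog, which datasets/registration/register_biomed_datasets.py appears to handle when the datasets package is imported (datasets/__init__.py pulls in the registration module). A small hedged sanity check, assuming the biomedparse_datasets/ layout expected by the registration code is in place:

# Hedged sketch: confirm the dataset name used in DATASETS.TRAIN/TEST is registered.
from detectron2.data import DatasetCatalog
import datasets  # importing the repo's datasets package runs its registration module

name = "biomed_BiomedParseData-Demo_demo"
print(name, name in DatasetCatalog.list())  # expect True once the demo data folder exists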
configs/biomedparse_inference.yaml
ADDED
@@ -0,0 +1,196 @@
+# Define Test/Trainer/Saving
+PIPELINE: XDecoderPipeline
+TRAINER: xdecoder
+SAVE_DIR: '../../data/output/test'
+base_path: "./"
+
+# Resume Logistic
+RESUME: false
+WEIGHT: false
+RESUME_FROM: ''
+EVAL_AT_START: false
+
+# Logging and Debug
+WANDB: False
+LOG_EVERY: 100
+FIND_UNUSED_PARAMETERS: false
+
+# Speed up training
+FP16: false
+PORT: '36873'
+
+# misc
+LOADER:
+  JOINT: False
+  KEY_DATASET: 'coco'
+
+STANDARD_TEXT_FOR_EVAL: False
+
+##################
+# Task settings
+##################
+VERBOSE: true
+MODEL:
+  NAME: seem_model_demo
+  HEAD: xdecoder_head
+  DIM_PROJ: 512
+  TEXT:
+    ARCH: vlpencoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 77 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: focal
+    PRETRAINED: ''
+    LOAD_PRETRAINED: false
+    FOCAL:
+      PRETRAIN_IMG_SIZE: 224
+      PATCH_SIZE: 4
+      EMBED_DIM: 192
+      DEPTHS: [2, 2, 18, 2]
+      FOCAL_LEVELS: [4, 4, 4, 4]
+      FOCAL_WINDOWS: [3, 3, 3, 3]
+      DROP_PATH_RATE: 0.3
+      MLP_RATIO: 4.0
+      DROP_RATE: 0.0
+      PATCH_NORM: True
+      USE_CONV_EMBED: True
+      SCALING_MODULATOR: True
+      USE_CHECKPOINT: False
+      USE_POSTLN: true
+      USE_POSTLN_IN_MODULATION: false
+      USE_LAYERSCALE: True
+      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+      OUT_INDICES: [0, 1, 2, 3]
+  ENCODER:
+    NAME: transformer_encoder_fpn
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 16
+    BINARY_CLASSES: False
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 512
+    MASK_DIM: 512
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  DECODER:
+    NAME: seem_demo
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK:
+      ENABLED: False
+    DETECTION: False
+    SPATIAL:
+      ENABLED: True
+      MAX_ITER: 1
+    GROUNDING:
+      ENABLED: True
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    VISUAL:
+      ENABLED: False
+    AUDIO:
+      ENABLED: False
+    RETRIEVAL:
+      ENABLED: False
+    LVIS:
+      ENABLED: True
+      THRES: 0.7
+    OPENIMAGE:
+      ENABLED: False
+      NEGATIVE_SAMPLES: 5
+      GROUNDING:
+        ENABLED: False
+        MAX_LEN: 5
+    CAPTION:
+      ENABLED: False
+      PHRASE_PROB: 0.5
+      SIM_THRES: 0.95
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    GCLASS_WEIGHT: 0.4
+    GMASK_WEIGHT: 1.0
+    GDICE_WEIGHT: 1.0
+    SCLASS_WEIGHT: 0.4
+    SMASK_WEIGHT: 1.0
+    SDICE_WEIGHT: 1.0
+    OCLASS_WEIGHT: 0.4
+    OMASK_WEIGHT: 1.0
+    ODICE_WEIGHT: 1.0
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    BBOX_WEIGHT: 5.0
+    GIOU_WEIGHT: 2.0
+    CAPTION_WEIGHT: 2.0
+    COST_SPATIAL:
+      CLASS_WEIGHT: 5.0
+      MASK_WEIGHT: 2.0
+      DICE_WEIGHT: 2.0
+    HIDDEN_DIM: 512
+    NUM_OBJECT_QUERIES: 101
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    MAX_SPATIAL_LEN: [512, 512, 512, 512]
+    # ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+    TOP_GROUNDING_LAYERS: 10
+    TOP_CAPTION_LAYERS: 10
+    TOP_SPATIAL_LAYERS: 10
+    TOP_OPENIMAGE_LAYERS: 10
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.4
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+      DETECTIONS_PER_IMAGE: 100
+
+# Multi-modal Architecture, order matters
+ATTENTION_ARCH:
+  VARIABLE:
+    queries: ['object']
+    tokens: ['grounding', 'spatial', 'visual', 'audio']
+  SELF_ATTENTION:
+    queries:
+      object: ['queries_object', 'tokens_grounding', 'tokens_spatial', 'tokens_visual', 'tokens_audio']
+    tokens:
+      grounding: ['queries_object', 'tokens_grounding']
+      spatial: ['tokens_spatial']
+      visual: ['tokens_visual']
+      audio: ['queries_object', 'tokens_audio']
+  CROSS_ATTENTION:
+    queries:
+      object: True
+    tokens:
+      grounding: False
+      spatial: False
+      visual: False
+      audio: False
+  MASKING: ['tokens_spatial', 'tokens_grounding', 'tokens_visual', 'tokens_audio']
+  DUPLICATION:
+    queries:
+      grounding: 'queries_object'
+      spatial: 'queries_object'
+  SPATIAL_MEMORIES: 32
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+# INPUT:
+#   PIXEL_MEAN: [64.284, 59.293, 59.962]
+#   PIXEL_STD: [62.484, 60.865, 59.835]
datasets/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from . import registration
+from .build import build_train_dataloader, build_eval_dataloader, build_evaluator
datasets/build.py
ADDED
|
@@ -0,0 +1,630 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
| 3 |
+
# Copyright (c) 2022 Microsoft
|
| 4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 5 |
+
# Modified by Xueyan Zou ([email protected])
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import numpy as np
|
| 11 |
+
import itertools
|
| 12 |
+
import logging
|
| 13 |
+
from typing import Any, Callable, Dict, List, Optional, Union
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
import torch.utils.data
|
| 17 |
+
import torch.utils.data as torchdata
|
| 18 |
+
|
| 19 |
+
import detectron2.utils.comm as comm
|
| 20 |
+
from detectron2.data.build import (
|
| 21 |
+
build_batch_data_loader,
|
| 22 |
+
load_proposals_into_dataset,
|
| 23 |
+
trivial_batch_collator,
|
| 24 |
+
)
|
| 25 |
+
from detectron2.data import MetadataCatalog
|
| 26 |
+
from detectron2.data.catalog import DatasetCatalog
|
| 27 |
+
from detectron2.data.common import DatasetFromList, MapDataset
|
| 28 |
+
from detectron2.data.dataset_mapper import DatasetMapper
|
| 29 |
+
from detectron2.data.samplers import InferenceSampler, TrainingSampler
|
| 30 |
+
from detectron2.evaluation import (
|
| 31 |
+
CityscapesInstanceEvaluator,
|
| 32 |
+
CityscapesSemSegEvaluator,
|
| 33 |
+
COCOEvaluator,
|
| 34 |
+
DatasetEvaluators,
|
| 35 |
+
LVISEvaluator,
|
| 36 |
+
verify_results,
|
| 37 |
+
)
|
| 38 |
+
from fvcore.common.config import CfgNode
|
| 39 |
+
|
| 40 |
+
from .dataset_mappers import *
|
| 41 |
+
from .evaluation import (InstanceSegEvaluator,
|
| 42 |
+
ClassificationEvaluator,
|
| 43 |
+
SemSegEvaluator,
|
| 44 |
+
RetrievalEvaluator,
|
| 45 |
+
#CaptioningEvaluator,
|
| 46 |
+
COCOPanopticEvaluator,
|
| 47 |
+
GroundingEvaluator,
|
| 48 |
+
InteractiveEvaluator,
|
| 49 |
+
)
|
| 50 |
+
from modeling.utils import configurable
|
| 51 |
+
from utilities.distributed import get_world_size
|
| 52 |
+
|
| 53 |
+
class JointLoader(torchdata.IterableDataset):
|
| 54 |
+
"""
|
| 55 |
+
Randomly sampple from one of the dataloaders per worker in each iteration.
|
| 56 |
+
The sampling probability is determined by the size of each dataset.
|
| 57 |
+
All examples from one worker (GPU) are from the same dataset in the iteration.
|
| 58 |
+
Mixing is achieved through multiple workers (GPUs).
|
| 59 |
+
"""
|
| 60 |
+
def __init__(self, loaders, key_dataset, sample_prob, mixing_level):
|
| 61 |
+
dataset_names = []
|
| 62 |
+
for key, loader in loaders.items():
|
| 63 |
+
name = "{}".format(key.split('_')[0])
|
| 64 |
+
setattr(self, name, loader)
|
| 65 |
+
dataset_names += [name]
|
| 66 |
+
self.dataset_names = dataset_names
|
| 67 |
+
self.key_dataset = key_dataset
|
| 68 |
+
if sample_prob == 'prop':
|
| 69 |
+
self.sample_prob = [len(getattr(self, key)) for key in self.dataset_names]
|
| 70 |
+
elif sample_prob == 'equal':
|
| 71 |
+
self.sample_prob = [1 for key in self.dataset_names]
|
| 72 |
+
elif sample_prob == 'sqrt':
|
| 73 |
+
self.sample_prob = [np.sqrt(len(getattr(self, key))) for key in self.dataset_names]
|
| 74 |
+
self.sample_prob = [p/sum(self.sample_prob) for p in self.sample_prob]
|
| 75 |
+
self.mixing_level = mixing_level
|
| 76 |
+
|
| 77 |
+
# Not sure how expensive `len(getattr(self, name))` is. computing this once and cache.
|
| 78 |
+
# this assumes the len of the underlying data loaders do not change.
|
| 79 |
+
self._len = sum(len(getattr(self, name)) for name in self.dataset_names)
|
| 80 |
+
|
| 81 |
+
def __iter__(self):
|
| 82 |
+
# Reset iterators at the start of each new epoch
|
| 83 |
+
self.iterators = {name: iter(getattr(self, name)) for name in self.dataset_names}
|
| 84 |
+
self._count = 0
|
| 85 |
+
return self
|
| 86 |
+
|
| 87 |
+
def __next__(self):
|
| 88 |
+
while self._count < self._len:
|
| 89 |
+
# Randomly select a dataloader
|
| 90 |
+
name = np.random.choice(self.dataset_names, size=None, replace=False, p=self.sample_prob)
|
| 91 |
+
iterator = self.iterators[name]
|
| 92 |
+
|
| 93 |
+
try:
|
| 94 |
+
# Get next batch from the selected dataloader
|
| 95 |
+
self._count += 1
|
| 96 |
+
return next(iterator)
|
| 97 |
+
except StopIteration:
|
| 98 |
+
# If the selected dataloader is exhausted, reinitialize it
|
| 99 |
+
self.iterators[name] = iter(getattr(self, name))
|
| 100 |
+
raise StopIteration
|
| 101 |
+
|
| 102 |
+
def __len__(self):
|
| 103 |
+
return self._len
|
| 104 |
+
|
| 105 |
+
def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
|
| 106 |
+
"""
|
| 107 |
+
Filter out images with none annotations or only crowd annotations
|
| 108 |
+
(i.e., images without non-crowd annotations).
|
| 109 |
+
A common training-time preprocessing on COCO dataset.
|
| 110 |
+
|
| 111 |
+
Args:
|
| 112 |
+
dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.
|
| 113 |
+
|
| 114 |
+
Returns:
|
| 115 |
+
list[dict]: the same format, but filtered.
|
| 116 |
+
"""
|
| 117 |
+
num_before = len(dataset_dicts)
|
| 118 |
+
|
| 119 |
+
def valid(anns):
|
| 120 |
+
for ann in anns:
|
| 121 |
+
if isinstance(ann, list):
|
| 122 |
+
for instance in ann:
|
| 123 |
+
if instance.get("iscrowd", 0) == 0:
|
| 124 |
+
return True
|
| 125 |
+
else:
|
| 126 |
+
if ann.get("iscrowd", 0) == 0:
|
| 127 |
+
return True
|
| 128 |
+
return False
|
| 129 |
+
|
| 130 |
+
dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
|
| 131 |
+
num_after = len(dataset_dicts)
|
| 132 |
+
logger = logging.getLogger(__name__)
|
| 133 |
+
logger.info(
|
| 134 |
+
"Removed {} images with no usable annotations. {} images left.".format(
|
| 135 |
+
num_before - num_after, num_after
|
| 136 |
+
)
|
| 137 |
+
)
|
| 138 |
+
return dataset_dicts
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_detection_dataset_dicts(
|
| 142 |
+
dataset_names, filter_empty=True, proposal_files=None
|
| 143 |
+
):
|
| 144 |
+
"""
|
| 145 |
+
Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.
|
| 146 |
+
|
| 147 |
+
Args:
|
| 148 |
+
dataset_names (str or list[str]): a dataset name or a list of dataset names
|
| 149 |
+
filter_empty (bool): whether to filter out images without instance annotations
|
| 150 |
+
proposal_files (list[str]): if given, a list of object proposal files
|
| 151 |
+
that match each dataset in `dataset_names`.
|
| 152 |
+
|
| 153 |
+
Returns:
|
| 154 |
+
list[dict]: a list of dicts following the standard dataset dict format.
|
| 155 |
+
"""
|
| 156 |
+
if isinstance(dataset_names, str):
|
| 157 |
+
dataset_names = [dataset_names]
|
| 158 |
+
assert len(dataset_names)
|
| 159 |
+
|
| 160 |
+
dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
|
| 161 |
+
for dataset_name, dicts in zip(dataset_names, dataset_dicts):
|
| 162 |
+
assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
|
| 163 |
+
|
| 164 |
+
if proposal_files is not None:
|
| 165 |
+
assert len(dataset_names) == len(proposal_files)
|
| 166 |
+
# load precomputed proposals from proposal files
|
| 167 |
+
dataset_dicts = [
|
| 168 |
+
load_proposals_into_dataset(dataset_i_dicts, proposal_file)
|
| 169 |
+
for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
|
| 170 |
+
]
|
| 171 |
+
|
| 172 |
+
dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))
|
| 173 |
+
|
| 174 |
+
has_instances = "annotations" in dataset_dicts[0]
|
| 175 |
+
if filter_empty and has_instances:
|
| 176 |
+
dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names)
|
| 177 |
+
|
| 178 |
+
assert len(dataset_dicts), "No valid data found in {}.".format(",".join(dataset_names))
|
| 179 |
+
return dataset_dicts
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _test_loader_from_config(cfg, dataset_name, mapper=None):
|
| 183 |
+
"""
|
| 184 |
+
Uses the given `dataset_name` argument (instead of the names in cfg), because the
|
| 185 |
+
standard practice is to evaluate each test set individually (not combining them).
|
| 186 |
+
"""
|
| 187 |
+
if isinstance(dataset_name, str):
|
| 188 |
+
dataset_name = [dataset_name]
|
| 189 |
+
|
| 190 |
+
dataset = get_detection_dataset_dicts(
|
| 191 |
+
dataset_name,
|
| 192 |
+
filter_empty=False,
|
| 193 |
+
proposal_files=None,
|
| 194 |
+
)
|
| 195 |
+
if mapper is None:
|
| 196 |
+
mapper_cfg = CfgNode({'INPUT': cfg['INPUT'], 'MODEL': cfg['MODEL'], 'DATASETS': cfg['DATASETS']})
|
| 197 |
+
mapper = DatasetMapper(mapper_cfg, False)
|
| 198 |
+
assert cfg['TEST']['BATCH_SIZE_TOTAL'] % get_world_size() == 0, "Evaluation total batchsize is not divisible by gpu number"
|
| 199 |
+
#batch_size = cfg['TEST']['BATCH_SIZE_TOTAL'] // get_world_size()
|
| 200 |
+
batch_size = 1
|
| 201 |
+
|
| 202 |
+
return {
|
| 203 |
+
"dataset": dataset,
|
| 204 |
+
"mapper": mapper,
|
| 205 |
+
"num_workers": cfg['DATALOADER']['NUM_WORKERS'],
|
| 206 |
+
"sampler": InferenceSampler(len(dataset)),
|
| 207 |
+
"batch_size": batch_size,
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
@configurable(from_config=_test_loader_from_config)
|
| 212 |
+
def build_detection_test_loader(
|
| 213 |
+
dataset: Union[List[Any], torchdata.Dataset],
|
| 214 |
+
*,
|
| 215 |
+
mapper: Callable[[Dict[str, Any]], Any],
|
| 216 |
+
sampler: Optional[torchdata.Sampler] = None,
|
| 217 |
+
batch_size: int = 1,
|
| 218 |
+
num_workers: int = 0,
|
| 219 |
+
collate_fn: Optional[Callable[[List[Any]], Any]] = None,
|
| 220 |
+
) -> torchdata.DataLoader:
|
| 221 |
+
"""
|
| 222 |
+
Similar to `build_detection_train_loader`, with default batch size = 1,
|
| 223 |
+
and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
|
| 224 |
+
to produce the exact set of all samples.
|
| 225 |
+
|
| 226 |
+
Args:
|
| 227 |
+
dataset: a list of dataset dicts,
|
| 228 |
+
or a pytorch dataset (either map-style or iterable). They can be obtained
|
| 229 |
+
by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
|
| 230 |
+
mapper: a callable which takes a sample (dict) from dataset
|
| 231 |
+
and returns the format to be consumed by the model.
|
| 232 |
+
When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
|
| 233 |
+
sampler: a sampler that produces
|
| 234 |
+
indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
|
| 235 |
+
which splits the dataset across all workers. Sampler must be None
|
| 236 |
+
if `dataset` is iterable.
|
| 237 |
+
batch_size: the batch size of the data loader to be created.
|
| 238 |
+
Default to 1 image per worker since this is the standard when reporting
|
| 239 |
+
inference time in papers.
|
| 240 |
+
num_workers: number of parallel data loading workers
|
| 241 |
+
collate_fn: same as the argument of `torch.utils.data.DataLoader`.
|
| 242 |
+
Defaults to do no collation and return a list of data.
|
| 243 |
+
|
| 244 |
+
Returns:
|
| 245 |
+
DataLoader: a torch DataLoader, that loads the given detection
|
| 246 |
+
dataset, with test-time transformation and batching.
|
| 247 |
+
|
| 248 |
+
Examples:
|
| 249 |
+
::
|
| 250 |
+
data_loader = build_detection_test_loader(
|
| 251 |
+
DatasetRegistry.get("my_test"),
|
| 252 |
+
mapper=DatasetMapper(...))
|
| 253 |
+
|
| 254 |
+
# or, instantiate with a CfgNode:
|
| 255 |
+
data_loader = build_detection_test_loader(cfg, "my_test")
|
| 256 |
+
"""
|
| 257 |
+
|
| 258 |
+
if isinstance(dataset, list):
|
| 259 |
+
dataset = DatasetFromList(dataset, copy=False)
|
| 260 |
+
if mapper is not None:
|
| 261 |
+
dataset = MapDataset(dataset, mapper)
|
| 262 |
+
if isinstance(dataset, torchdata.IterableDataset):
|
| 263 |
+
assert sampler is None, "sampler must be None if dataset is IterableDataset"
|
| 264 |
+
else:
|
| 265 |
+
if sampler is None:
|
| 266 |
+
sampler = InferenceSampler(len(dataset))
|
| 267 |
+
return torchdata.DataLoader(
|
| 268 |
+
dataset,
|
| 269 |
+
batch_size=batch_size,
|
| 270 |
+
sampler=sampler,
|
| 271 |
+
drop_last=False,
|
| 272 |
+
num_workers=num_workers,
|
| 273 |
+
collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
|
| 274 |
+
)
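As a usage sketch of the test loader (the in-memory dataset and identity mapper below are stand-ins, not the repo's real ones, and it assumes the @configurable wrapper forwards a direct keyword call unchanged, as in detectron2): because the default collate function is trivial_batch_collator, each batch yielded by this loader is simply a Python list of mapped samples.

# Sketch only: a stand-in dataset and identity "mapper"; real code passes DatasetMapper(cfg, is_train=False).
toy_dataset = [{"image_id": i, "file_name": f"img_{i:04d}.png"} for i in range(4)]

loader = build_detection_test_loader(
    dataset=toy_dataset,
    mapper=lambda d: d,
    batch_size=2,
    num_workers=0,
)

for batch in loader:
    # With the trivial collator, every batch is a list[dict] of length batch_size.
    assert isinstance(batch, list) and len(batch) == 2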
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def _train_loader_from_config(cfg, dataset_name, mapper, *, dataset=None, sampler=None):
|
| 278 |
+
cfg_datasets = cfg['DATASETS']
|
| 279 |
+
cfg_dataloader = cfg['DATALOADER']
|
| 280 |
+
|
| 281 |
+
if dataset is None:
|
| 282 |
+
dataset = get_detection_dataset_dicts(
|
| 283 |
+
dataset_name,
|
| 284 |
+
filter_empty=cfg_dataloader['FILTER_EMPTY_ANNOTATIONS'],
|
| 285 |
+
proposal_files=cfg_datasets['PROPOSAL_FILES_TRAIN'] if cfg_dataloader['LOAD_PROPOSALS'] else None,
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
if mapper is None:
|
| 289 |
+
mapper = DatasetMapper(cfg, True)
|
| 290 |
+
|
| 291 |
+
if sampler is None:
|
| 292 |
+
sampler_name = cfg_dataloader['SAMPLER_TRAIN']
|
| 293 |
+
logger = logging.getLogger(__name__)
|
| 294 |
+
logger.info("Using training sampler {}".format(sampler_name))
|
| 295 |
+
sampler = TrainingSampler(len(dataset))
|
| 296 |
+
|
| 297 |
+
return {
|
| 298 |
+
"dataset": dataset,
|
| 299 |
+
"sampler": sampler,
|
| 300 |
+
"mapper": mapper,
|
| 301 |
+
"total_batch_size": cfg['TRAIN']['BATCH_SIZE_TOTAL'],
|
| 302 |
+
"aspect_ratio_grouping": cfg_dataloader['ASPECT_RATIO_GROUPING'],
|
| 303 |
+
"num_workers": cfg_dataloader['NUM_WORKERS'],
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
@configurable(from_config=_train_loader_from_config)
|
| 308 |
+
def build_detection_train_loader(
|
| 309 |
+
dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
|
| 310 |
+
):
|
| 311 |
+
"""
|
| 312 |
+
Build a dataloader for object detection with some default features.
|
| 313 |
+
This interface is experimental.
|
| 314 |
+
|
| 315 |
+
Args:
|
| 316 |
+
dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
|
| 317 |
+
or a map-style pytorch dataset. They can be obtained by using
|
| 318 |
+
:func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
|
| 319 |
+
mapper (callable): a callable which takes a sample (dict) from dataset and
|
| 320 |
+
returns the format to be consumed by the model.
|
| 321 |
+
When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
|
| 322 |
+
sampler (torch.utils.data.sampler.Sampler or None): a sampler that
|
| 323 |
+
produces indices to be applied on ``dataset``.
|
| 324 |
+
Default to :class:`TrainingSampler`, which coordinates a random shuffle
|
| 325 |
+
sequence across all workers.
|
| 326 |
+
total_batch_size (int): total batch size across all workers. Batching
|
| 327 |
+
simply puts data into a list.
|
| 328 |
+
aspect_ratio_grouping (bool): whether to group images with similar
|
| 329 |
+
aspect ratio for efficiency. When enabled, it requires each
|
| 330 |
+
element in dataset be a dict with keys "width" and "height".
|
| 331 |
+
num_workers (int): number of parallel data loading workers
|
| 332 |
+
|
| 333 |
+
Returns:
|
| 334 |
+
torch.utils.data.DataLoader: a dataloader. Each output from it is a
|
| 335 |
+
``list[mapped_element]`` of length ``total_batch_size / num_workers``,
|
| 336 |
+
where ``mapped_element`` is produced by the ``mapper``.
|
| 337 |
+
"""
|
| 338 |
+
if isinstance(dataset, list):
|
| 339 |
+
dataset = DatasetFromList(dataset, copy=False)
|
| 340 |
+
if mapper is not None:
|
| 341 |
+
dataset = MapDataset(dataset, mapper)
|
| 342 |
+
if sampler is None:
|
| 343 |
+
sampler = TrainingSampler(len(dataset))
|
| 344 |
+
assert isinstance(sampler, torch.utils.data.sampler.Sampler)
|
| 345 |
+
return build_batch_data_loader(
|
| 346 |
+
dataset,
|
| 347 |
+
sampler,
|
| 348 |
+
total_batch_size,
|
| 349 |
+
aspect_ratio_grouping=aspect_ratio_grouping,
|
| 350 |
+
num_workers=num_workers,
|
| 351 |
+
)
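The total_batch_size argument is a global figure; assuming detectron2's build_batch_data_loader (which this function delegates to), it is split evenly across processes, so it must be divisible by the world size. A small sketch of that arithmetic:

# Sketch of the per-process batch split performed downstream (assumption: even split across ranks).
from detectron2.utils.comm import get_world_size

def per_gpu_batch_size(total_batch_size: int) -> int:
    world_size = get_world_size()
    assert total_batch_size % world_size == 0, (
        f"total_batch_size={total_batch_size} must be divisible by world_size={world_size}"
    )
    return total_batch_size // world_size

print(per_gpu_batch_size(32))  # e.g. 32 with one process; with 4 processes each rank would get 8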
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def get_config_from_name(cfg, dataset_name):
|
| 355 |
+
# adjust config according to dataset
|
| 356 |
+
if 'refcoco' in dataset_name:
|
| 357 |
+
cfg.update(cfg['REF'])
|
| 358 |
+
return cfg
|
| 359 |
+
elif 'cocomini' in dataset_name:
|
| 360 |
+
cfg.update(cfg['DAVIS'])
|
| 361 |
+
return cfg
|
| 362 |
+
elif 'ytvos' in dataset_name:
|
| 363 |
+
cfg.update(cfg['VOS'])
|
| 364 |
+
return cfg
|
| 365 |
+
elif 'ade600' in dataset_name:
|
| 366 |
+
cfg.update(cfg['DAVIS'])
|
| 367 |
+
return cfg
|
| 368 |
+
elif 'openimage600' in dataset_name:
|
| 369 |
+
cfg.update(cfg['DAVIS'])
|
| 370 |
+
return cfg
|
| 371 |
+
elif 'ade' in dataset_name:
|
| 372 |
+
if 'ADE20K' in cfg.keys():
|
| 373 |
+
cfg.update(cfg['ADE20K'])
|
| 374 |
+
return cfg
|
| 375 |
+
elif 'imagenet' in dataset_name:
|
| 376 |
+
if 'IMAGENET' in cfg.keys():
|
| 377 |
+
cfg.update(cfg['IMAGENET'])
|
| 378 |
+
return cfg
|
| 379 |
+
elif 'vlp' in dataset_name:
|
| 380 |
+
cfg.update(cfg['VLP'])
|
| 381 |
+
return cfg
|
| 382 |
+
elif 'coco' in dataset_name:
|
| 383 |
+
if 'COCO' in cfg.keys():
|
| 384 |
+
cfg.update(cfg['COCO'])
|
| 385 |
+
return cfg
|
| 386 |
+
elif 'voc' in dataset_name:
|
| 387 |
+
cfg.update(cfg['VOC'])
|
| 388 |
+
return cfg
|
| 389 |
+
elif 'context' in dataset_name:
|
| 390 |
+
cfg.update(cfg['CONTEXT'])
|
| 391 |
+
return cfg
|
| 392 |
+
elif 'sun' in dataset_name:
|
| 393 |
+
cfg.update(cfg['SUN'])
|
| 394 |
+
return cfg
|
| 395 |
+
elif 'scan' in dataset_name:
|
| 396 |
+
cfg.update(cfg['SCAN'])
|
| 397 |
+
return cfg
|
| 398 |
+
elif 'cityscape' in dataset_name:
|
| 399 |
+
cfg.update(cfg['CITY'])
|
| 400 |
+
return cfg
|
| 401 |
+
elif 'bdd' in dataset_name:
|
| 402 |
+
cfg.update(cfg['BDD'])
|
| 403 |
+
return cfg
|
| 404 |
+
elif 'tsv' in dataset_name:
|
| 405 |
+
cfg.update(cfg['TSV'])
|
| 406 |
+
return cfg
|
| 407 |
+
elif 'phrasecut' in dataset_name:
|
| 408 |
+
cfg.update(cfg['PHRASE'])
|
| 409 |
+
return cfg
|
| 410 |
+
elif 'object365' in dataset_name:
|
| 411 |
+
cfg.update(cfg['OBJECT365'])
|
| 412 |
+
return cfg
|
| 413 |
+
elif 'openimage' in dataset_name:
|
| 414 |
+
cfg.update(cfg['OPENIMAGE'])
|
| 415 |
+
return cfg
|
| 416 |
+
elif 'lvis' in dataset_name:
|
| 417 |
+
cfg.update(cfg['LVIS'])
|
| 418 |
+
return cfg
|
| 419 |
+
elif 'seginw' in dataset_name:
|
| 420 |
+
cfg.update(cfg['SEGINW'])
|
| 421 |
+
return cfg
|
| 422 |
+
elif 'sbd' in dataset_name:
|
| 423 |
+
cfg.update(cfg['SBD'])
|
| 424 |
+
return cfg
|
| 425 |
+
elif 'davis' in dataset_name:
|
| 426 |
+
cfg.update(cfg['DAVIS'])
|
| 427 |
+
return cfg
|
| 428 |
+
elif 'med_sam' in dataset_name:
|
| 429 |
+
cfg.update(cfg['MedSAM'])
|
| 430 |
+
return cfg
|
| 431 |
+
elif 'biomed' in dataset_name:
|
| 432 |
+
cfg.update(cfg['BioMed'])
|
| 433 |
+
return cfg
|
| 434 |
+
elif 'sam' in dataset_name:
|
| 435 |
+
cfg.update(cfg['SAM'])
|
| 436 |
+
return cfg
|
| 437 |
+
else:
|
| 438 |
+
assert False, "dataset not support."
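To make the substring dispatch above concrete, a toy sketch (dataset name and config values are made up): the per-dataset block is merged into the top level with dict.update, so downstream code simply reads the overridden keys.

# Toy illustration of the name-based override used by get_config_from_name.
cfg = {
    'INPUT': {'IMAGE_SIZE': 1024},
    'BioMed': {'INPUT': {'IMAGE_SIZE': 512, 'RANDOM_ROTATE': True}},
}
dataset_name = 'biomedparse_demo_train'  # hypothetical name containing 'biomed'
if 'biomed' in dataset_name:
    cfg.update(cfg['BioMed'])
print(cfg['INPUT'])  # {'IMAGE_SIZE': 512, 'RANDOM_ROTATE': True}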
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def build_eval_dataloader(cfg, ):
|
| 442 |
+
dataloaders = []
|
| 443 |
+
for dataset_name in cfg['DATASETS']['TEST']:
|
| 444 |
+
cfg = get_config_from_name(cfg, dataset_name)
|
| 445 |
+
# adjust mapper according to dataset
|
| 446 |
+
if dataset_name == 'imagenet_val':
|
| 447 |
+
mapper = ImageNetDatasetMapper(cfg, False)
|
| 448 |
+
elif dataset_name == 'bdd10k_val_sem_seg':
|
| 449 |
+
mapper = BDDSemDatasetMapper(cfg, False)
|
| 450 |
+
elif dataset_name in ["vlp_val", "vlp_captioning_val", "vlp_val2017", "vlp_captioning_val2017"]:
|
| 451 |
+
mapper = VLPreDatasetMapper(cfg, False, dataset_name)
|
| 452 |
+
elif dataset_name in ["scannet_21_val_seg", "scannet_38_val_seg", "scannet_41_val_seg"]:
|
| 453 |
+
mapper = ScanNetSegDatasetMapper(cfg, False)
|
| 454 |
+
elif dataset_name in ["scannet_21_panoptic_val", 'bdd10k_40_panoptic_val']:
|
| 455 |
+
mapper = ScanNetPanoDatasetMapper(cfg, False)
|
| 456 |
+
elif "pascalvoc_val" in dataset_name:
|
| 457 |
+
mapper = PascalVOCSegDatasetMapperIX(cfg, False, dataset_name)
|
| 458 |
+
elif 'sun' in dataset_name:
|
| 459 |
+
mapper = SunRGBDSegDatasetMapper(cfg, False)
|
| 460 |
+
elif 'refcoco' in dataset_name:
|
| 461 |
+
mapper = RefCOCODatasetMapper(cfg, False)
|
| 462 |
+
elif 'med_sam' in dataset_name:
|
| 463 |
+
mapper = MedSAMDatasetMapper(cfg, False)
|
| 464 |
+
elif 'biomed' in dataset_name:
|
| 465 |
+
mapper = BioMedDatasetMapper(cfg, False)
|
| 466 |
+
else:
|
| 467 |
+
mapper = None
|
| 468 |
+
dataloaders += [build_detection_test_loader(cfg, dataset_name, mapper=mapper)]
|
| 469 |
+
return dataloaders
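The returned loaders follow the order of cfg['DATASETS']['TEST'], so evaluation code can zip them with the dataset names; a sketch of that driver loop (cfg and model are assumed to exist, and build_evaluator is defined further below in this file):

def run_evaluation(cfg, model):
    # Sketch: evaluate `model` on every test set listed in cfg['DATASETS']['TEST'].
    results = {}
    for dataset_name, loader in zip(cfg['DATASETS']['TEST'], build_eval_dataloader(cfg)):
        evaluator = build_evaluator(cfg, dataset_name)
        evaluator.reset()
        for inputs in loader:
            evaluator.process(inputs, model(inputs))
        results[dataset_name] = evaluator.evaluate()
    return results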
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
def build_train_dataloader(cfg, ):
|
| 473 |
+
dataset_names = cfg['DATASETS']['TRAIN']
|
| 474 |
+
|
| 475 |
+
loaders = {}
|
| 476 |
+
for dataset_name in dataset_names:
|
| 477 |
+
cfg = get_config_from_name(cfg, dataset_name)
|
| 478 |
+
mapper_name = cfg['INPUT']['DATASET_MAPPER_NAME']
|
| 479 |
+
# Semantic segmentation dataset mapper
|
| 480 |
+
if mapper_name == "mask_former_semantic":
|
| 481 |
+
mapper = MaskFormerSemanticDatasetMapper(cfg, True)
|
| 482 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 483 |
+
# Panoptic segmentation dataset mapper
|
| 484 |
+
elif mapper_name == "mask_former_panoptic":
|
| 485 |
+
mapper = MaskFormerPanopticDatasetMapper(cfg, True)
|
| 486 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 487 |
+
# Instance segmentation dataset mapper
|
| 488 |
+
elif mapper_name == "mask_former_instance":
|
| 489 |
+
mapper = MaskFormerInstanceDatasetMapper(cfg, True)
|
| 490 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 491 |
+
# coco instance segmentation lsj new baseline
|
| 492 |
+
elif mapper_name == "coco_instance_lsj":
|
| 493 |
+
mapper = COCOInstanceNewBaselineDatasetMapper(cfg, True)
|
| 494 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 495 |
+
# coco panoptic segmentation lsj new baseline
|
| 496 |
+
elif mapper_name == "coco_panoptic_lsj":
|
| 497 |
+
mapper = COCOPanopticNewBaselineDatasetMapper(cfg, True)
|
| 498 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 499 |
+
elif mapper_name == "vlpretrain":
|
| 500 |
+
mapper = VLPreDatasetMapper(cfg, True, dataset_name)
|
| 501 |
+
loaders['vlp'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 502 |
+
elif mapper_name == "refcoco":
|
| 503 |
+
mapper = RefCOCODatasetMapper(cfg, True)
|
| 504 |
+
loaders['ref'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 505 |
+
elif mapper_name == "coco_interactive":
|
| 506 |
+
mapper = COCOPanopticInteractiveDatasetMapper(cfg, True)
|
| 507 |
+
loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 508 |
+
elif mapper_name == "medsam_interactive":
|
| 509 |
+
mapper = MedSAMDatasetMapper(cfg, True)
|
| 510 |
+
loaders['med_sam'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 511 |
+
elif mapper_name == "biomed_interactive":
|
| 512 |
+
mapper = BioMedDatasetMapper(cfg, True)
|
| 513 |
+
name_key = dataset_name.split("_")[1]
|
| 514 |
+
loaders[name_key] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 515 |
+
else:
|
| 516 |
+
mapper = None
|
| 517 |
+
loaders[dataset_name] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
|
| 518 |
+
|
| 519 |
+
if len(loaders) == 1 or not cfg['LOADER'].get('JOINT', False):
|
| 520 |
+
return list(loaders.values())[0]
|
| 521 |
+
else:
|
| 522 |
+
sample_prob = cfg['LOADER'].get('SAMPLE_PROB', 'prop')
|
| 523 |
+
mixing_level = cfg['LOADER'].get('MIXING_LEVEL', 1)
|
| 524 |
+
return JointLoader(loaders, key_dataset=cfg['LOADER'].get('KEY_DATASET', 'coco'), sample_prob=sample_prob, mixing_level=mixing_level)
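JointLoader itself lives elsewhere in the repo; as one plausible reading of sample_prob='prop' (an illustration, not the repo's implementation), the loader that supplies the next batch can be drawn with probability proportional to dataset size:

# Illustration only: proportional ("prop") sampling across task loaders.
import random

def pick_loader(loader_sizes: dict) -> str:
    names = list(loader_sizes)
    weights = [loader_sizes[n] for n in names]
    return random.choices(names, weights=weights, k=1)[0]

sizes = {'biomed_ct': 8000, 'biomed_mri': 2000}  # hypothetical dataset sizes
picks = [pick_loader(sizes) for _ in range(10000)]
print(picks.count('biomed_ct') / len(picks))  # roughly 0.8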
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def build_evaluator(cfg, dataset_name, output_folder=None):
|
| 528 |
+
"""
|
| 529 |
+
Create evaluator(s) for a given dataset.
|
| 530 |
+
This uses the special metadata "evaluator_type" associated with each
|
| 531 |
+
builtin dataset. For your own dataset, you can simply create an
|
| 532 |
+
evaluator manually in your script and do not have to worry about the
|
| 533 |
+
hacky if-else logic here.
|
| 534 |
+
"""
|
| 535 |
+
if output_folder is None:
|
| 536 |
+
output_folder = os.path.join(cfg["SAVE_DIR"], "inference")
|
| 537 |
+
evaluator_list = []
|
| 538 |
+
evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
|
| 539 |
+
|
| 540 |
+
# semantic segmentation
|
| 541 |
+
if evaluator_type in ["sem_seg", "ade20k_panoptic_seg"]:
|
| 542 |
+
evaluator_list.append(
|
| 543 |
+
SemSegEvaluator(
|
| 544 |
+
dataset_name,
|
| 545 |
+
distributed=True,
|
| 546 |
+
output_dir=output_folder,
|
| 547 |
+
)
|
| 548 |
+
)
|
| 549 |
+
# instance segmentation
|
| 550 |
+
if evaluator_type == "coco":
|
| 551 |
+
evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
|
| 552 |
+
|
| 553 |
+
cfg_model_decoder_test = cfg["MODEL"]["DECODER"]["TEST"]
|
| 554 |
+
# panoptic segmentation
|
| 555 |
+
if evaluator_type in [
|
| 556 |
+
"coco_panoptic_seg",
|
| 557 |
+
"ade20k_panoptic_seg",
|
| 558 |
+
"cityscapes_panoptic_seg",
|
| 559 |
+
"mapillary_vistas_panoptic_seg",
|
| 560 |
+
"scannet_panoptic_seg",
|
| 561 |
+
"bdd_panoptic_pano"
|
| 562 |
+
]:
|
| 563 |
+
if cfg_model_decoder_test["PANOPTIC_ON"]:
|
| 564 |
+
evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
|
| 565 |
+
# COCO
|
| 566 |
+
if (evaluator_type == "coco_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]) or evaluator_type == "object365_od":
|
| 567 |
+
evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
|
| 568 |
+
if (evaluator_type == "coco_panoptic_seg" and cfg_model_decoder_test["SEMANTIC_ON"]) or evaluator_type == "coco_sem_seg":
|
| 569 |
+
evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder))
|
| 570 |
+
# Mapillary Vistas
|
| 571 |
+
if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]:
|
| 572 |
+
evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
|
| 573 |
+
if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg_model_decoder_test["SEMANTIC_ON"]:
|
| 574 |
+
evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder))
|
| 575 |
+
# Cityscapes
|
| 576 |
+
if evaluator_type == "cityscapes_instance":
|
| 577 |
+
assert (
|
| 578 |
+
torch.cuda.device_count() > comm.get_rank()
|
| 579 |
+
), "CityscapesEvaluator currently do not work with multiple machines."
|
| 580 |
+
return CityscapesInstanceEvaluator(dataset_name)
|
| 581 |
+
if evaluator_type == "cityscapes_sem_seg":
|
| 582 |
+
assert (
|
| 583 |
+
torch.cuda.device_count() > comm.get_rank()
|
| 584 |
+
), "CityscapesEvaluator currently do not work with multiple machines."
|
| 585 |
+
return CityscapesSemSegEvaluator(dataset_name)
|
| 586 |
+
if evaluator_type == "cityscapes_panoptic_seg":
|
| 587 |
+
if cfg_model_decoder_test["SEMANTIC_ON"]:
|
| 588 |
+
assert (
|
| 589 |
+
torch.cuda.device_count() > comm.get_rank()
|
| 590 |
+
), "CityscapesEvaluator currently do not work with multiple machines."
|
| 591 |
+
evaluator_list.append(CityscapesSemSegEvaluator(dataset_name))
|
| 592 |
+
if cfg_model_decoder_test["INSTANCE_ON"]:
|
| 593 |
+
assert (
|
| 594 |
+
torch.cuda.device_count() > comm.get_rank()
|
| 595 |
+
), "CityscapesEvaluator currently do not work with multiple machines."
|
| 596 |
+
evaluator_list.append(CityscapesInstanceEvaluator(dataset_name))
|
| 597 |
+
# ADE20K
|
| 598 |
+
if evaluator_type == "ade20k_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]:
|
| 599 |
+
evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
|
| 600 |
+
# SEGINW
|
| 601 |
+
if evaluator_type == "seginw" and cfg_model_decoder_test["INSTANCE_ON"]:
|
| 602 |
+
evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
|
| 603 |
+
# LVIS
|
| 604 |
+
if evaluator_type == "lvis":
|
| 605 |
+
return LVISEvaluator(dataset_name, output_dir=output_folder)
|
| 606 |
+
# Classification
|
| 607 |
+
if evaluator_type == "classification":
|
| 608 |
+
evaluator_list.append(ClassificationEvaluator(dataset_name, output_folder))
|
| 609 |
+
# Retrieval
|
| 610 |
+
if evaluator_type in ["retrieval"]:
|
| 611 |
+
evaluator_list.append(RetrievalEvaluator(dataset_name, output_folder, cfg['MODEL']['DECODER']['RETRIEVAL']['ENSEMBLE']))
|
| 612 |
+
if evaluator_type == "captioning":
|
| 613 |
+
evaluator_list.append(CaptioningEvaluator(dataset_name, output_folder, MetadataCatalog.get(dataset_name).gt_json))
|
| 614 |
+
if evaluator_type in ["grounding_refcoco", "grounding_phrasecut", "grounding_spatial", "grounding_entity"]:
|
| 615 |
+
evaluator_list.append(GroundingEvaluator(dataset_name))
|
| 616 |
+
# Interactive
|
| 617 |
+
if evaluator_type in ["interactive", "interactive_grounding"]:
|
| 618 |
+
evaluator_list.append(InteractiveEvaluator(dataset_name, output_dir=output_folder, max_clicks=cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER'], iou_iter=cfg['STROKE_SAMPLER']['EVAL']['IOU_ITER']))
|
| 619 |
+
|
| 620 |
+
if len(evaluator_list) == 0:
|
| 621 |
+
raise NotImplementedError(
|
| 622 |
+
"no Evaluator for the dataset {} with the type {}".format(
|
| 623 |
+
dataset_name, evaluator_type
|
| 624 |
+
)
|
| 625 |
+
)
|
| 626 |
+
elif len(evaluator_list) == 1:
|
| 627 |
+
return evaluator_list[0]
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
return DatasetEvaluators(evaluator_list)
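Because the dispatch is driven entirely by the "evaluator_type" metadata, a custom dataset only needs that field set; a minimal sketch (the dataset name is hypothetical and the toy config carries only the keys this function touches):

# Sketch: register a hypothetical dataset and let build_evaluator pick the evaluator.
from detectron2.data import DatasetCatalog, MetadataCatalog

DatasetCatalog.register("toy_biomed_demo_val", lambda: [])
MetadataCatalog.get("toy_biomed_demo_val").evaluator_type = "grounding_refcoco"

toy_cfg = {"SAVE_DIR": "./output", "MODEL": {"DECODER": {"TEST": {}}}}
evaluator = build_evaluator(toy_cfg, "toy_biomed_demo_val", output_folder="./inference_demo")
print(type(evaluator).__name__)  # GroundingEvaluator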
|
datasets/dataset_mappers/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
from .biomed_dataset_mapper import BioMedDatasetMapper
|
datasets/dataset_mappers/biomed_dataset_mapper.py
ADDED
|
@@ -0,0 +1,378 @@
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
|
| 3 |
+
import copy
|
| 4 |
+
import logging
|
| 5 |
+
import random
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from transformers import AutoTokenizer, LlamaForCausalLM
|
| 11 |
+
|
| 12 |
+
from detectron2.data import detection_utils as utils
|
| 13 |
+
from detectron2.data import transforms as T
|
| 14 |
+
from detectron2.data.transforms import TransformGen
|
| 15 |
+
from detectron2.structures import BitMasks, Boxes, Instances, BoxMode
|
| 16 |
+
from detectron2.structures.boxes import pairwise_iou
|
| 17 |
+
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
|
| 18 |
+
from detectron2.data import MetadataCatalog
|
| 19 |
+
from pycocotools import mask as coco_mask
|
| 20 |
+
|
| 21 |
+
from utilities import prompt_engineering
|
| 22 |
+
from modeling.language import build_tokenizer
|
| 23 |
+
from modeling.language.misc import text_noun_with_prompt_all
|
| 24 |
+
from modeling.utils import configurable
|
| 25 |
+
|
| 26 |
+
from ..visual_sampler.sampler import build_shape_sampler
|
| 27 |
+
|
| 28 |
+
__all__ = ["BioMedDatasetMapper"]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def build_transform_gen(cfg, is_train):
|
| 32 |
+
"""
|
| 33 |
+
Create a list of default :class:`Augmentation` from config.
|
| 34 |
+
Now it includes resizing and flipping.
|
| 35 |
+
Returns:
|
| 36 |
+
list[Augmentation]
|
| 37 |
+
"""
|
| 38 |
+
assert is_train, "Only support training augmentation"
|
| 39 |
+
cfg_input = cfg['INPUT']
|
| 40 |
+
image_size = cfg_input['IMAGE_SIZE']
|
| 41 |
+
min_scale = cfg_input['MIN_SCALE']
|
| 42 |
+
max_scale = cfg_input['MAX_SCALE']
|
| 43 |
+
|
| 44 |
+
augmentation = []
|
| 45 |
+
|
| 46 |
+
if cfg_input['RANDOM_FLIP'] != "none":
|
| 47 |
+
augmentation.append(
|
| 48 |
+
T.RandomFlip(
|
| 49 |
+
horizontal=cfg_input['RANDOM_FLIP'] == "horizontal",
|
| 50 |
+
vertical=cfg_input['RANDOM_FLIP'] == "vertical",
|
| 51 |
+
)
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
augmentation.extend([
|
| 55 |
+
T.ResizeScale(
|
| 56 |
+
min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
|
| 57 |
+
),
|
| 58 |
+
T.FixedSizeCrop(crop_size=(image_size, image_size)),
|
| 59 |
+
])
|
| 60 |
+
|
| 61 |
+
return augmentation
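A small sketch of how these TransformGens are applied later in the mapper (the config values are made up): T.apply_transform_gens transforms the image and returns the composed transforms, which the mapper reuses on masks via apply_segmentation.

# Sketch with made-up INPUT values; assumes the detectron2 augmentations imported above.
import numpy as np
from detectron2.data import transforms as T

toy_cfg = {'INPUT': {'IMAGE_SIZE': 1024, 'MIN_SCALE': 0.9, 'MAX_SCALE': 1.1, 'RANDOM_FLIP': 'horizontal'}}
tfm_gens = build_transform_gen(toy_cfg, is_train=True)

image = np.zeros((768, 1024, 3), dtype=np.uint8)
image, transforms = T.apply_transform_gens(tfm_gens, image)
# transforms.apply_segmentation(mask) would replay the same geometric ops on a mask.
print(image.shape)  # (1024, 1024, 3) after ResizeScale + FixedSizeCrop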
|
| 62 |
+
|
| 63 |
+
def build_transform_gen_se(cfg, is_train):
|
| 64 |
+
# min_scale = cfg['INPUT']['MIN_SIZE_TEST']
|
| 65 |
+
# max_scale = cfg['INPUT']['MAX_SIZE_TEST']
|
| 66 |
+
|
| 67 |
+
augmentation = []
|
| 68 |
+
# augmentation.extend([
|
| 69 |
+
# T.ResizeShortestEdge(
|
| 70 |
+
# min_scale, max_size=max_scale
|
| 71 |
+
# ),
|
| 72 |
+
# ])
|
| 73 |
+
return augmentation
|
| 74 |
+
|
| 75 |
+
def convert_coco_poly_to_mask(segmentations, height, width):
|
| 76 |
+
masks = []
|
| 77 |
+
for polygons in segmentations:
|
| 78 |
+
rles = coco_mask.frPyObjects(polygons, height, width)
|
| 79 |
+
mask = coco_mask.decode(rles)
|
| 80 |
+
if len(mask.shape) < 3:
|
| 81 |
+
mask = mask[..., None]
|
| 82 |
+
mask = torch.as_tensor(mask, dtype=torch.uint8)
|
| 83 |
+
mask = mask.any(dim=2)
|
| 84 |
+
masks.append(mask)
|
| 85 |
+
if masks:
|
| 86 |
+
masks = torch.stack(masks, dim=0)
|
| 87 |
+
else:
|
| 88 |
+
masks = torch.zeros((0, height, width), dtype=torch.uint8)
|
| 89 |
+
return masks
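A toy example of the polygon-to-mask conversion above (values chosen only for illustration): each annotation's polygon list is rasterised through pycocotools RLE and collapsed into one binary mask per annotation.

# Toy polygon: an axis-aligned square inside a 10x10 image, in COCO [x0, y0, x1, y1, ...] format.
square = [[2.0, 2.0, 6.0, 2.0, 6.0, 6.0, 2.0, 6.0]]

masks = convert_coco_poly_to_mask([square], height=10, width=10)
print(masks.shape)      # torch.Size([1, 10, 10])
print(masks[0].sum())   # number of foreground pixels in the rasterised square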
|
| 90 |
+
|
| 91 |
+
# Adapted from the COCO panoptic dataset mapper for biomedical grounding data.
|
| 92 |
+
class BioMedDatasetMapper:
|
| 93 |
+
"""
|
| 94 |
+
A callable which takes a dataset dict in Detectron2 Dataset format,
|
| 95 |
+
and maps it into a format used by MaskFormer.
|
| 96 |
+
|
| 97 |
+
This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
|
| 98 |
+
|
| 99 |
+
The callable currently does the following:
|
| 100 |
+
|
| 101 |
+
1. Reads the image from "file_name"
|
| 102 |
+
2. Applies geometric transforms to the image and annotation
|
| 103 |
+
3. Finds and applies suitable cropping to the image and annotation
|
| 104 |
+
4. Prepares the image and annotation as Tensors
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
@configurable
|
| 108 |
+
def __init__(
|
| 109 |
+
self,
|
| 110 |
+
is_train=True,
|
| 111 |
+
*,
|
| 112 |
+
tfm_gens,
|
| 113 |
+
image_format,
|
| 114 |
+
caption_thres,
|
| 115 |
+
grounding,
|
| 116 |
+
lvis,
|
| 117 |
+
lvis_thres,
|
| 118 |
+
max_grounding_num,
|
| 119 |
+
shape_sampler,
|
| 120 |
+
retrieval,
|
| 121 |
+
max_token_num,
|
| 122 |
+
tokenizer,
|
| 123 |
+
binary_classes: bool,
|
| 124 |
+
rotate: bool,
|
| 125 |
+
):
|
| 126 |
+
"""
|
| 127 |
+
NOTE: this interface is experimental.
|
| 128 |
+
Args:
|
| 129 |
+
is_train: for training or inference
|
| 130 |
+
augmentations: a list of augmentations or deterministic transforms to apply
|
| 131 |
+
crop_gen: crop augmentation
|
| 132 |
+
tfm_gens: data augmentation
|
| 133 |
+
image_format: an image format supported by :func:`detection_utils.read_image`.
|
| 134 |
+
"""
|
| 135 |
+
self.tfm_gens = tfm_gens
|
| 136 |
+
logging.getLogger(__name__).info(
|
| 137 |
+
"[BioMedDatasetMapper] Full TransformGens used in training: {}".format(
|
| 138 |
+
str(self.tfm_gens)
|
| 139 |
+
)
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
self.img_format = image_format
|
| 143 |
+
self.is_train = is_train
|
| 144 |
+
self.caption_thres = caption_thres
|
| 145 |
+
self.grounding = grounding
|
| 146 |
+
self.lvis = lvis
|
| 147 |
+
self.lvis_thres = lvis_thres
|
| 148 |
+
self.max_grounding_num = max_grounding_num
|
| 149 |
+
|
| 150 |
+
self.shape_sampler = shape_sampler
|
| 151 |
+
|
| 152 |
+
self.retrieval = retrieval
|
| 153 |
+
self.tokenizer = tokenizer
|
| 154 |
+
self.max_token_num = max_token_num
|
| 155 |
+
|
| 156 |
+
self.binary_classes = binary_classes
|
| 157 |
+
self.rotate = rotate
|
| 158 |
+
|
| 159 |
+
@classmethod
|
| 160 |
+
def from_config(cls, cfg, is_train=True):
|
| 161 |
+
# Build augmentation
|
| 162 |
+
if is_train:
|
| 163 |
+
tfm_gens = build_transform_gen(cfg, is_train)
|
| 164 |
+
else:
|
| 165 |
+
tfm_gens = build_transform_gen_se(cfg, is_train)
|
| 166 |
+
|
| 167 |
+
shape_sampler = build_shape_sampler(cfg)
|
| 168 |
+
|
| 169 |
+
retrieval = cfg['MODEL']['DECODER']['RETRIEVAL']['ENABLED']
|
| 170 |
+
tokenizer, max_token_num = None, None
|
| 171 |
+
if retrieval:
|
| 172 |
+
lang_model = cfg['MODEL']['TEXT']['NAME']
|
| 173 |
+
max_token_num = cfg['MODEL']['TEXT']['CONTEXT_LENGTH']
|
| 174 |
+
if 'llama' in lang_model:
|
| 175 |
+
tokenizer = AutoTokenizer.from_pretrained(lang_model, padding_side='right')
|
| 176 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 177 |
+
else:
|
| 178 |
+
tokenizer = build_tokenizer(cfg['MODEL']['TEXT'])
|
| 179 |
+
|
| 180 |
+
ret = {
|
| 181 |
+
"is_train": is_train,
|
| 182 |
+
"tfm_gens": tfm_gens,
|
| 183 |
+
"image_format": cfg['INPUT']['FORMAT'],
|
| 184 |
+
"caption_thres": cfg['MODEL']['DECODER']['CAPTION']['SIM_THRES'],
|
| 185 |
+
"grounding": cfg['MODEL']['DECODER']['GROUNDING']['ENABLED'],
|
| 186 |
+
"lvis": cfg['MODEL']['DECODER']['LVIS']['ENABLED'],
|
| 187 |
+
"lvis_thres": cfg['MODEL']['DECODER']['LVIS']['THRES'],
|
| 188 |
+
"max_grounding_num": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'],
|
| 189 |
+
"shape_sampler": shape_sampler,
|
| 190 |
+
"retrieval": retrieval,
|
| 191 |
+
"max_token_num": max_token_num,
|
| 192 |
+
"tokenizer": tokenizer,
|
| 193 |
+
"binary_classes": cfg['MODEL']['ENCODER']['BINARY_CLASSES'],
|
| 194 |
+
"rotate": cfg['INPUT']['RANDOM_ROTATE'],
|
| 195 |
+
}
|
| 196 |
+
return ret
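The __call__ method below applies an optional k*90-degree rotation; a standalone sketch of the invariant it relies on (toy arrays): rotating the image and every mask by the same k keeps them spatially aligned.

# Standalone illustration of the rotation augmentation used in __call__.
import random
import numpy as np

image = np.random.rand(64, 64, 3)
mask = (np.random.rand(64, 64) > 0.5).astype(np.uint8)

rotate_time = random.randint(1, 3)
image_rot = np.rot90(image, rotate_time)
mask_rot = np.rot90(mask, rotate_time)

assert mask_rot.sum() == mask.sum()            # rotation preserves the mask area
assert image_rot.shape[:2] == mask_rot.shape   # image and mask stay the same size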
|
| 197 |
+
|
| 198 |
+
def __call__(self, dataset_dict):
|
| 199 |
+
"""
|
| 200 |
+
Args:
|
| 201 |
+
dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
|
| 202 |
+
|
| 203 |
+
Returns:
|
| 204 |
+
dict: a format that builtin models in detectron2 accept
|
| 205 |
+
"""
|
| 206 |
+
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
|
| 207 |
+
while True:
|
| 208 |
+
try:
|
| 209 |
+
image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
|
| 210 |
+
break
|
| 211 |
+
except:
|
| 212 |
+
print('Image loading error:', dataset_dict["file_name"])
|
| 213 |
+
|
| 214 |
+
utils.check_image_size(dataset_dict, image)
|
| 215 |
+
|
| 216 |
+
image, transforms = T.apply_transform_gens(self.tfm_gens, image)
|
| 217 |
+
image_shape = image.shape[:2] # h, w
|
| 218 |
+
|
| 219 |
+
rotate_time = 0
|
| 220 |
+
if self.is_train and self.rotate and random.random() < 0.5:
|
| 221 |
+
rotate_time = random.randint(1, 3)
|
| 222 |
+
if rotate_time > 0:
|
| 223 |
+
image = np.rot90(image, rotate_time)
|
| 224 |
+
|
| 225 |
+
# Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
|
| 226 |
+
# but not efficient on large generic data structures due to the use of pickle & mp.Queue.
|
| 227 |
+
# Therefore it's important to use torch.Tensor.
|
| 228 |
+
dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
grounding_anno = dataset_dict['grounding_info']
|
| 232 |
+
if len(grounding_anno) == 0:
|
| 233 |
+
print(dataset_dict['file_name'])
|
| 234 |
+
assert len(grounding_anno) > 0
|
| 235 |
+
masks_grd = []
|
| 236 |
+
texts_grd = []
|
| 237 |
+
boxes_grd = []
|
| 238 |
+
hash_grd = []
|
| 239 |
+
classes = []
|
| 240 |
+
masks_orig = []
|
| 241 |
+
for ann in grounding_anno:
|
| 242 |
+
if 'segmentation' in ann:
|
| 243 |
+
if len(ann['segmentation']) == 0:
|
| 244 |
+
print('Empty segmentation!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
|
| 245 |
+
continue
|
| 246 |
+
rle = coco_mask.frPyObjects(
|
| 247 |
+
ann['segmentation'], dataset_dict['height'], dataset_dict['width'])
|
| 248 |
+
m = coco_mask.decode(rle)
|
| 249 |
+
masks_orig.append(m)
|
| 250 |
+
# sometimes there are multiple binary map (corresponding to multiple segs)
|
| 251 |
+
m = np.sum(m, axis=2)
|
| 252 |
+
else:
|
| 253 |
+
# directly read from mask file
|
| 254 |
+
while True:
|
| 255 |
+
try:
|
| 256 |
+
m = utils.read_image(ann["mask_file"], format=self.img_format)
|
| 257 |
+
break
|
| 258 |
+
except:
|
| 259 |
+
print('Image loading error:', ann["mask_file"])
|
| 260 |
+
m = np.sum(m, axis=2)
|
| 261 |
+
m = 1 * (m > 0)
|
| 262 |
+
m = m.astype(np.uint8) # convert to np.uint8
|
| 263 |
+
m = transforms.apply_segmentation(255*m[:,:,None])[:,:,0]
|
| 264 |
+
if rotate_time > 0:
|
| 265 |
+
m = np.rot90(m, rotate_time)
|
| 266 |
+
masks_grd += [m]
|
| 267 |
+
rand_id = random.randint(0, len(ann['sentences'])-1)
|
| 268 |
+
texts_grd.append(ann['sentences'][rand_id]['raw'].lower())
|
| 269 |
+
hash_grd.append(hash(ann['sentences'][rand_id]['raw'].lower()))
|
| 270 |
+
if self.binary_classes:
|
| 271 |
+
ann["category_id"] = 1 * (ann["category_id"] > 0)
|
| 272 |
+
classes.append(ann["category_id"])
|
| 273 |
+
#masks_grd = torch.from_numpy(np.stack(masks_grd))
|
| 274 |
+
boxes_grd = torch.tensor(boxes_grd)
|
| 275 |
+
groundings = {'masks': masks_grd, 'texts': texts_grd, 'hash': hash_grd, 'mode': 'text'}
|
| 276 |
+
dataset_dict["groundings"] = groundings
|
| 277 |
+
|
| 278 |
+
masks_grd = torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks_grd])
|
| 279 |
+
|
| 280 |
+
instances = Instances(image_shape)
|
| 281 |
+
|
| 282 |
+
instances.gt_masks = BitMasks(masks_grd)
|
| 283 |
+
instances.gt_boxes = BitMasks(masks_grd).get_bounding_boxes()
|
| 284 |
+
|
| 285 |
+
classes = np.array(classes)
|
| 286 |
+
is_things = np.array([1 for _ in classes])
|
| 287 |
+
instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
|
| 288 |
+
instances.is_things = torch.tensor(is_things, dtype=torch.int64)
|
| 289 |
+
|
| 290 |
+
dataset_dict["instances"] = instances
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
spatial_query_utils = self.shape_sampler(instances)
|
| 294 |
+
dataset_dict['spatial_query'] = spatial_query_utils
|
| 295 |
+
|
| 296 |
+
if self.retrieval:
|
| 297 |
+
captions = dataset_dict['captions']
|
| 298 |
+
tokens = self.tokenizer(
|
| 299 |
+
captions, padding='max_length', truncation=True, max_length=self.max_token_num, return_tensors='pt'
|
| 300 |
+
)
|
| 301 |
+
dataset_dict['tokens'] = {"input_ids": tokens["input_ids"], "attention_mask": tokens["attention_mask"]}
|
| 302 |
+
|
| 303 |
+
if self.grounding:
|
| 304 |
+
grounding_anno = dataset_dict['grounding_info']
|
| 305 |
+
grounding_len = random.randint(1, self.max_grounding_num-1)
|
| 306 |
+
if len(grounding_anno) > 0:
|
| 307 |
+
masks_grd = []
|
| 308 |
+
texts_grd = []
|
| 309 |
+
mode = 'text'
|
| 310 |
+
random.shuffle(grounding_anno)
|
| 311 |
+
for ann in grounding_anno:
|
| 312 |
+
if 'segmentation' in ann:
|
| 313 |
+
if len(ann['segmentation']) == 0:
|
| 314 |
+
print('Empty segmentation!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
|
| 315 |
+
continue
|
| 316 |
+
rle = coco_mask.frPyObjects(
|
| 317 |
+
ann['segmentation'], dataset_dict['height'], dataset_dict['width'])
|
| 318 |
+
m = coco_mask.decode(rle)
|
| 319 |
+
# sometimes there are multiple binary map (corresponding to multiple segs)
|
| 320 |
+
m = np.sum(m, axis=2)
|
| 321 |
+
else:
|
| 322 |
+
# directly read from mask file
|
| 323 |
+
while True:
|
| 324 |
+
try:
|
| 325 |
+
m = utils.read_image(ann["mask_file"], format=self.img_format)
|
| 326 |
+
break
|
| 327 |
+
except:
|
| 328 |
+
print('Image loading error:', ann["mask_file"])
|
| 329 |
+
m = np.sum(m, axis=2)
|
| 330 |
+
m = 1 * (m > 0)
|
| 331 |
+
|
| 332 |
+
m = m.astype(np.uint8) # convert to np.uint8
|
| 333 |
+
m = transforms.apply_segmentation(m[:,:,None])[:,:,0]
|
| 334 |
+
if rotate_time > 0:
|
| 335 |
+
m = np.rot90(m, rotate_time)
|
| 336 |
+
masks_grd += [m]
|
| 337 |
+
# random select a sentence of a single annotation.
|
| 338 |
+
rand_index = random.randint(0, len(ann['sentences'])-1)
|
| 339 |
+
texts_grd += [ann['sentences'][rand_index]['raw'].lower()]
|
| 340 |
+
# max_len = min(grounding_len, len(texts_grd))
|
| 341 |
+
max_len = len(masks_grd)
|
| 342 |
+
indices = np.random.permutation(max_len)
|
| 343 |
+
texts_grd = list(np.array(texts_grd)[indices])
|
| 344 |
+
masks_grd = torch.tensor(np.stack(masks_grd)[indices])
|
| 345 |
+
hash_grd = np.array([hash(txt) for txt in texts_grd])
|
| 346 |
+
else:
|
| 347 |
+
masks_grd = instances.gt_masks.tensor
|
| 348 |
+
mode = 'class'
|
| 349 |
+
if len(masks_grd) == 0:
|
| 350 |
+
masks_grd = torch.tensor([])
|
| 351 |
+
texts_grd = ['none']
|
| 352 |
+
hash_grd = np.array([hash(txt) for txt in texts_grd])
|
| 353 |
+
else:
|
| 354 |
+
biomed_classes = ['liver', 'lung', 'kidney', 'pancreas', 'heart anatomies', 'brain anatomies',
|
| 355 |
+
'eye anatomies', 'vessel', 'other organ', 'tumor', 'infection', 'other lesion',
|
| 356 |
+
'fluid disturbance', 'other abnormality', 'histology structure', 'other']
|
| 357 |
+
if self.binary_classes:
|
| 358 |
+
biomed_classes = ['target']
|
| 359 |
+
texts_grd = np.array(biomed_classes)
|
| 360 |
+
hash_grd = np.array([hash(txt) for txt in texts_grd])
|
| 361 |
+
unique_hash_grd = np.unique(hash_grd)
|
| 362 |
+
np.random.shuffle(unique_hash_grd)
|
| 363 |
+
max_len = min(grounding_len, len(unique_hash_grd))
|
| 364 |
+
indices = np.random.permutation(max_len)
|
| 365 |
+
selected_unique_hash_grd = unique_hash_grd[indices]
|
| 366 |
+
selected_mask = np.in1d(hash_grd, selected_unique_hash_grd)
|
| 367 |
+
texts_grd = texts_grd[selected_mask]
|
| 368 |
+
hash_grd = hash_grd[selected_mask]
|
| 369 |
+
masks_grd = masks_grd[selected_mask]
|
| 370 |
+
texts_grd = [prompt_engineering(text.replace('-other','').replace('-merged','').replace('-stuff',''), topk=10000, suffix='.') \
|
| 371 |
+
for text in texts_grd]
|
| 372 |
+
groundings = {'masks': masks_grd, 'texts': texts_grd, 'mode': mode, 'hash': hash_grd}
|
| 373 |
+
dataset_dict["groundings"] = groundings
|
| 374 |
+
assert len(masks_grd) == len(dataset_dict['grounding_info']), f"len(masks_grd)={len(masks_grd)}, len(dataset_dict['grounding_info'])={len(dataset_dict['grounding_info'])}, mask shape={masks_grd.shape}, max_len={max_len}, grounding_len={grounding_len}, len(texts_grd)={len(texts_grd)}, len(hash_grd)={len(hash_grd)}"
|
| 375 |
+
# gt_masks_orisize = torch.stack([torch.from_numpy(m.squeeze(-1)) for m in masks_orig])
|
| 376 |
+
# dataset_dict['gt_masks_orisize'] = gt_masks_orisize # (nm,h,w)
|
| 377 |
+
|
| 378 |
+
return dataset_dict
|
datasets/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
| 1 |
+
from .instance_evaluation import *
|
| 2 |
+
from .classification_evaluation import *
|
| 3 |
+
from .segmentation_evaluation import *
|
| 4 |
+
from .retrieval_evaluation import *
|
| 5 |
+
#from .captioning_evaluation import *
|
| 6 |
+
from .panoptic_evaluation import *
|
| 7 |
+
from .grounding_evaluation import *
|
| 8 |
+
from .interactive_evaluation import *
|
datasets/evaluation/captioning_evaluation.py
ADDED
|
@@ -0,0 +1,129 @@
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
# --------------------------------------------------------
|
| 3 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
| 4 |
+
# Copyright (c) 2022 Microsoft
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# Modified by Xueyan Zou ([email protected])
|
| 7 |
+
# --------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import itertools
|
| 13 |
+
|
| 14 |
+
import detectron2.utils.comm as comm
|
| 15 |
+
from detectron2.evaluation.evaluator import DatasetEvaluator
|
| 16 |
+
|
| 17 |
+
from caption_pycocotools.coco import COCO
|
| 18 |
+
from pycocoevalcap.eval import COCOEvalCap
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CaptioningEvaluator(DatasetEvaluator):
|
| 22 |
+
"""
|
| 23 |
+
Evaluate AR for object proposals, AP for instance detection/segmentation, AP
|
| 24 |
+
for keypoint detection outputs using COCO's metrics.
|
| 25 |
+
See http://cocodataset.org/#detection-eval and
|
| 26 |
+
http://cocodataset.org/#keypoints-eval to understand its metrics.
|
| 27 |
+
The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
|
| 28 |
+
the metric cannot be computed (e.g. due to no predictions made).
|
| 29 |
+
In addition to COCO, this evaluator is able to support any bounding box detection,
|
| 30 |
+
instance segmentation, or keypoint detection dataset.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(
|
| 34 |
+
self,
|
| 35 |
+
distributed=True,
|
| 36 |
+
output_dir=None,
|
| 37 |
+
gt_json=None,
|
| 38 |
+
):
|
| 39 |
+
"""
|
| 40 |
+
Args:
|
| 41 |
+
dataset_name (str): name of the dataset to be evaluated.
|
| 42 |
+
It must have either the following corresponding metadata:
|
| 43 |
+
"json_file": the path to the COCO format annotation
|
| 44 |
+
Or it must be in detectron2's standard dataset format
|
| 45 |
+
so it can be converted to COCO format automatically.
|
| 46 |
+
tasks (tuple[str]): tasks that can be evaluated under the given
|
| 47 |
+
configuration. A task is one of "bbox", "segm", "keypoints".
|
| 48 |
+
By default, will infer this automatically from predictions.
|
| 49 |
+
distributed (True): if True, will collect results from all ranks and run evaluation
|
| 50 |
+
in the main process.
|
| 51 |
+
Otherwise, will only evaluate the results in the current process.
|
| 52 |
+
output_dir (str): optional, an output directory to dump all
|
| 53 |
+
results predicted on the dataset. The dump contains two files:
|
| 54 |
+
1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
|
| 55 |
+
contains all the results in the format they are produced by the model.
|
| 56 |
+
2. "coco_instances_results.json" a json file in COCO's result format.
|
| 57 |
+
max_dets_per_image (int): limit on the maximum number of detections per image.
|
| 58 |
+
By default in COCO, this limit is to 100, but this can be customized
|
| 59 |
+
to be greater, as is needed in evaluation metrics AP fixed and AP pool
|
| 60 |
+
(see https://arxiv.org/pdf/2102.01066.pdf)
|
| 61 |
+
This doesn't affect keypoint evaluation.
|
| 62 |
+
use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
|
| 63 |
+
Although the results should be very close to the official implementation in COCO
|
| 64 |
+
API, it is still recommended to compute results with the official API for use in
|
| 65 |
+
papers. The faster implementation also uses more RAM.
|
| 66 |
+
kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
|
| 67 |
+
See http://cocodataset.org/#keypoints-eval
|
| 68 |
+
When empty, it will use the defaults in COCO.
|
| 69 |
+
Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
|
| 70 |
+
allow_cached_coco (bool): Whether to use cached coco json from previous validation
|
| 71 |
+
runs. You should set this to False if you need to use different validation data.
|
| 72 |
+
Defaults to True.
|
| 73 |
+
"""
|
| 74 |
+
self._logger = logging.getLogger(__name__)
|
| 75 |
+
self._distributed = distributed
|
| 76 |
+
self._output_dir = output_dir
|
| 77 |
+
self._gt_json = COCO(gt_json)
|
| 78 |
+
|
| 79 |
+
def reset(self):
|
| 80 |
+
self._gen_captions = []
|
| 81 |
+
self._image_ids = []
|
| 82 |
+
|
| 83 |
+
def process(self, inputs, outputs):
|
| 84 |
+
"""
|
| 85 |
+
Args:
|
| 86 |
+
inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
|
| 87 |
+
It is a list of dict. Each dict corresponds to an image and
|
| 88 |
+
contains keys like "height", "width", "file_name", "image_id".
|
| 89 |
+
outputs: the outputs of a COCO model. It is a list of dicts with key
|
| 90 |
+
"instances" that contains :class:`Instances`.
|
| 91 |
+
"""
|
| 92 |
+
for output in outputs:
|
| 93 |
+
self._image_ids.append(output['image_id'])
|
| 94 |
+
self._gen_captions.append(output['captioning_text'])
|
| 95 |
+
|
| 96 |
+
def evaluate(self, img_ids=None):
|
| 97 |
+
"""
|
| 98 |
+
Args:
|
| 99 |
+
img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
if self._distributed:
|
| 103 |
+
comm.synchronize()
|
| 104 |
+
def gather(x, move=False):
|
| 105 |
+
x = comm.gather(x)
|
| 106 |
+
x = list(itertools.chain(*x))
|
| 107 |
+
if move:
|
| 108 |
+
x = [xx.to(self._gen_captions[0].device) for xx in x]
|
| 109 |
+
return x
|
| 110 |
+
gen_captions = gather(self._gen_captions)
|
| 111 |
+
image_ids = gather(self._image_ids)
|
| 112 |
+
if not comm.is_main_process():
|
| 113 |
+
return {}
|
| 114 |
+
else:
|
| 115 |
+
gen_captions = self._gen_captions
|
| 116 |
+
image_ids = self._image_ids
|
| 117 |
+
|
| 118 |
+
assert len(gen_captions) == len(image_ids)
|
| 119 |
+
pred_captions = [{"image_id": image_id, "caption": gen_caption} for image_id, gen_caption in zip(image_ids, gen_captions)]
|
| 120 |
+
pred_pth = os.path.join(self._output_dir, 'results.json')
|
| 121 |
+
json.dump(pred_captions, open(pred_pth, "w"))
|
| 122 |
+
|
| 123 |
+
gt_captions = self._gt_json
|
| 124 |
+
pred_captions = gt_captions.loadRes(pred_pth)
|
| 125 |
+
|
| 126 |
+
cocoEval = COCOEvalCap(gt_captions, pred_captions)
|
| 127 |
+
cocoEval.params['image_id'] = pred_captions.getImgIds()
|
| 128 |
+
cocoEval.evaluate()
|
| 129 |
+
return cocoEval.eval
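The gather step above follows the usual detectron2 pattern for distributed evaluation; a generic sketch of it, independent of the captioning specifics:

# Generic sketch of the rank-gather used by the evaluators in this folder.
import itertools
import detectron2.utils.comm as comm

def gather_to_main(local_items):
    comm.synchronize()
    gathered = comm.gather(local_items)   # per-rank lists on the main process, [] elsewhere
    if not comm.is_main_process():
        return None
    return list(itertools.chain(*gathered))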
|
datasets/evaluation/classification_evaluation.py
ADDED
|
@@ -0,0 +1,76 @@
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
| 2 |
+
# --------------------------------------------------------
|
| 3 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
| 4 |
+
# Copyright (c) 2022 Microsoft
|
| 5 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 6 |
+
# Modified by Xueyan Zou ([email protected])
|
| 7 |
+
# --------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
from detectron2.evaluation.evaluator import DatasetEvaluator
|
| 13 |
+
|
| 14 |
+
from utilities.misc import AverageMeter
|
| 15 |
+
from utilities.distributed import get_world_size
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@torch.no_grad()
|
| 19 |
+
def accuracy(output, target, topk=(1,)):
|
| 20 |
+
"""Computes the precision@k for the specified values of k"""
|
| 21 |
+
if isinstance(output, list):
|
| 22 |
+
output = output[-1]
|
| 23 |
+
|
| 24 |
+
n_classes = output.size()[1]
|
| 25 |
+
maxk = min(max(topk), n_classes)
|
| 26 |
+
batch_size = target.size(0)
|
| 27 |
+
_, pred = output.topk(maxk, 1, True, True)
|
| 28 |
+
pred = pred.t()
|
| 29 |
+
correct = pred.eq(target.reshape(1, -1).expand_as(pred))
|
| 30 |
+
|
| 31 |
+
res = []
|
| 32 |
+
for k in topk:
|
| 33 |
+
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
|
| 34 |
+
res.append(correct_k.mul_(100.0 / batch_size).item())
|
| 35 |
+
return res
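A worked example of the helper above (toy logits, batch of two, six classes): top-1 counts only the arg-max prediction, while top-5 counts a hit anywhere among the five highest-scoring classes.

# Toy check of accuracy().
import torch

logits = torch.tensor([
    [0.1, 0.2, 0.9, 0.0, 0.0, 0.0],   # arg-max is class 2
    [0.5, 0.4, 0.0, 0.0, 0.3, 0.0],   # arg-max is class 0; class 4 is third best
])
targets = torch.tensor([2, 4])

top1, top5 = accuracy(logits, targets, topk=(1, 5))
print(top1, top5)  # 50.0 100.0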
|
| 36 |
+
|
| 37 |
+
class ClassificationEvaluator(DatasetEvaluator):
|
| 38 |
+
def __init__(self, *args):
|
| 39 |
+
self.top1 = AverageMeter()
|
| 40 |
+
self.top5 = AverageMeter()
|
| 41 |
+
self._logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
def reset(self):
|
| 44 |
+
self.top1.reset()
|
| 45 |
+
self.top5.reset()
|
| 46 |
+
|
| 47 |
+
def process(self, inputs, outputs):
|
| 48 |
+
logits = torch.stack([o['pred_class'] for o in outputs])
|
| 49 |
+
y = torch.tensor([t['class_id'] for t in inputs], device=logits.device)
|
| 50 |
+
prec1, prec5 = accuracy(logits, y, (1, 5))
|
| 51 |
+
self.top1.update(prec1, y.size(0))
|
| 52 |
+
self.top5.update(prec5, y.size(0))
|
| 53 |
+
|
| 54 |
+
def evaluate(self):
|
| 55 |
+
if get_world_size() > 1:
|
| 56 |
+
tmp_tensor = torch.tensor(
|
| 57 |
+
[self.top1.sum, self.top5.sum, self.top1.count],
|
| 58 |
+
device=torch.cuda.current_device()
|
| 59 |
+
)
|
| 60 |
+
torch.distributed.all_reduce(
|
| 61 |
+
tmp_tensor, torch.distributed.ReduceOp.SUM
|
| 62 |
+
)
|
| 63 |
+
top1_sum, top5_sum, count = tmp_tensor.tolist()
|
| 64 |
+
else:
|
| 65 |
+
top1_sum = self.top1.sum
|
| 66 |
+
top5_sum = self.top5.sum
|
| 67 |
+
count = self.top1.count
|
| 68 |
+
|
| 69 |
+
results = {}
|
| 70 |
+
scores = {
|
| 71 |
+
'top1': top1_sum / count,
|
| 72 |
+
"top5": top5_sum / count
|
| 73 |
+
}
|
| 74 |
+
results['class'] = scores
|
| 75 |
+
self._logger.info(results)
|
| 76 |
+
return results
|
datasets/evaluation/grounding_evaluation.py
ADDED
|
@@ -0,0 +1,173 @@
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
| 3 |
+
# Copyright (c) 2022 Microsoft
|
| 4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 5 |
+
# Modified by Xueyan Zou ([email protected])
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
import logging
|
| 8 |
+
import torch
|
| 9 |
+
from torchvision.ops import box_iou
|
| 10 |
+
|
| 11 |
+
from detectron2.structures import BoxMode
|
| 12 |
+
from detectron2.data import MetadataCatalog
|
| 13 |
+
from detectron2.utils.comm import all_gather, is_main_process, synchronize
|
| 14 |
+
from detectron2.evaluation.evaluator import DatasetEvaluator
|
| 15 |
+
|
| 16 |
+
import matplotlib.pyplot as plt
|
| 17 |
+
import numpy as np
|
| 18 |
+
import os
|
| 19 |
+
|
| 20 |
+
import copy
|
| 21 |
+
|
| 22 |
+
class GroundingEvaluator(DatasetEvaluator):
|
| 23 |
+
"""
|
| 24 |
+
Evaluate grounding segmentation metrics.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(
|
| 28 |
+
self,
|
| 29 |
+
dataset_name,
|
| 30 |
+
compute_box=False,
|
| 31 |
+
distributed=True,
|
| 32 |
+
):
|
| 33 |
+
self._logger = logging.getLogger(__name__)
|
| 34 |
+
self._dataset_name = dataset_name
|
| 35 |
+
self._distributed = distributed
|
| 36 |
+
self._cpu_device = torch.device("cpu")
|
| 37 |
+
self._compute_box = compute_box
|
| 38 |
+
meta = MetadataCatalog.get(dataset_name)
|
| 39 |
+
|
| 40 |
+
def reset(self):
|
| 41 |
+
self.cum_I = 0
|
| 42 |
+
self.cum_U = 0
|
| 43 |
+
self.mIoU = 0
|
| 44 |
        self.mDice = 0
        self.cum_mean_area = 0
        self.eval_seg_iou_list = [.5, .6, .7, .8, .9]
        self.seg_correct = torch.zeros(len(self.eval_seg_iou_list), device=self._cpu_device)
        self.seg_total = 0
        self.instance_results = []
        if self._compute_box:
            self.mIoU_box = 0
            self.seg_correct_box = torch.zeros(len(self.eval_seg_iou_list), device=self._cpu_device)

    @staticmethod
    def computeIoU(pred_seg, gd_seg):
        I = (pred_seg & gd_seg)
        U = (pred_seg | gd_seg)
        return I, U

    def get_metadata(self, _input):
        """
        Extracts and returns specific metadata from the input dictionary.

        Parameters:
        _input (dict): A dictionary containing keys like 'file_name', 'image_id', and 'grounding_info'.
            The 'grounding_info' is a list of dictionaries with keys like 'area', 'iscrowd', etc.

        Returns:
        dict: A dictionary containing filtered metadata.
        """

        _input = copy.deepcopy(_input)

        selected_input_keys = ['file_name', 'image_id', 'grounding_info']
        selected_grounding_info_keys = ['area', 'mask_file', 'iscrowd', 'image_id', 'category_id', 'id', 'file_name', 'split', 'ann_id', 'ref_id']

        filtered_input = {key: _input[key] for key in selected_input_keys if key in _input}

        # Check if grounding_info is present and is a list
        if 'grounding_info' in filtered_input and isinstance(filtered_input['grounding_info'], list):
            # Filter each grounding_info dictionary
            filtered_input['grounding_info'] = [
                {key: info[key] for key in selected_grounding_info_keys if key in info}
                for info in filtered_input['grounding_info']
            ]

        return filtered_input

    def process(self, inputs, outputs):
        for input, output in zip(inputs, outputs):
            pred = output['grounding_mask'].sigmoid() > 0.5
            # # save pixel probability
            # prob = output['grounding_mask'].sigmoid().cpu().numpy()[0] * 255
            # pred_file = input['file_name'].split('.')[0].replace('test/', 'test_pred/') + '_' + input['groundings']['texts'][0].replace(' ', '+') + '.png'
            # if not os.path.exists('/'.join(pred_file.split('/')[:-1])):
            #     os.makedirs('/'.join(pred_file.split('/')[:-1]), exist_ok=True)
            # plt.imsave(pred_file,
            #            prob.astype(np.uint8), cmap='gray')

            gt = input['groundings']['masks'].bool()
            bsi = len(pred)
            I, U = self.computeIoU(pred, gt)
            self.cum_I += I.sum().cpu()
            self.cum_U += U.sum().cpu()
            IoU = I.reshape(bsi,-1).sum(-1)*1.0 / (U.reshape(bsi,-1).sum(-1) + 1e-6)
            self.mIoU += IoU.sum().cpu()
            # Add Dice score in eval
            Dice = I.reshape(bsi,-1).sum(-1)*2.0 / (gt.reshape(bsi,-1).sum(-1) + pred.reshape(bsi,-1).sum(-1) + 1e-6)
            self.mDice += Dice.sum().cpu()
            self.cum_mean_area += ((gt.reshape(bsi,-1).sum(-1) + pred.reshape(bsi,-1).sum(-1)) / 2.0).sum().cpu()

            if self._compute_box:
                pred_box = BoxMode.convert(output['grounding_box'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
                gt_box = BoxMode.convert(input['groundings']['boxes'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS).cpu()
                IoU_box = box_iou(pred_box, gt_box).diagonal()
                self.mIoU_box += IoU_box.sum()

            for idx in range(len(self.eval_seg_iou_list)):
                eval_seg_iou = self.eval_seg_iou_list[idx]
                self.seg_correct[idx] += (IoU >= eval_seg_iou).sum().cpu()
                if self._compute_box:
                    self.seg_correct_box[idx] += (IoU_box >= eval_seg_iou).sum().cpu()
            self.seg_total += bsi

            instance_result = {
                'metadata': self.get_metadata(input),
                'IoU': IoU.cpu().numpy().tolist(),
                'Dice': Dice.cpu().numpy().tolist(),
                'I': I.sum(dim=(1, 2)).cpu().numpy().tolist(),
                'U': U.sum(dim=(1, 2)).cpu().numpy().tolist(),
                'IoU_box': IoU_box.cpu().numpy().tolist() if self._compute_box else '',
                'pred_area': pred.reshape(bsi,-1).sum(-1).cpu().numpy().tolist(),
            }

            iou_len = IoU.shape[0]
            grounding_info_len = len(self.get_metadata(input)['grounding_info'])
            assert iou_len == grounding_info_len, f'Number of IoU scores ({iou_len}) and grounding info ({grounding_info_len}) do not match.'
            self.instance_results.append(instance_result)

    def evaluate(self):
        if self._distributed:
            synchronize()
            self.cum_I = torch.stack(all_gather(self.cum_I)).sum()
            self.cum_U = torch.stack(all_gather(self.cum_U)).sum()
            self.mIoU = torch.stack(all_gather(self.mIoU)).sum()
            self.mDice = torch.stack(all_gather(self.mDice)).sum()
            self.cum_mean_area = torch.stack(all_gather(self.cum_mean_area)).sum()
            self.seg_correct = torch.stack(all_gather(self.seg_correct)).sum(0)
            self.seg_total = sum(all_gather(self.seg_total))
            self.instance_results = sum(all_gather(self.instance_results), [])
            if self._compute_box:
                self.mIoU_box = torch.stack(all_gather(self.mIoU_box)).sum()
                self.seg_correct_box = torch.stack(all_gather(self.seg_correct_box)).sum(0)
            if not is_main_process():
                return

        results = {}
        for idx in range(len(self.eval_seg_iou_list)):
            result_str = 'precision@{}'.format(self.eval_seg_iou_list[idx])
            results[result_str] = (self.seg_correct[idx]*100 / self.seg_total).item()
        results['cIoU'] = (self.cum_I*100./self.cum_U).item()
        results['mIoU'] = (self.mIoU*100./self.seg_total).item()
        results['cDice'] = (self.cum_I*100./self.cum_mean_area).item()
        results['mDice'] = (self.mDice*100./self.seg_total).item()

        if self._compute_box:
            for idx in range(len(self.eval_seg_iou_list)):
                result_str = 'precisionB@{}'.format(self.eval_seg_iou_list[idx])
                results[result_str] = (self.seg_correct_box[idx]*100 / self.seg_total).item()
            results['mBIoU'] = (self.mIoU_box*100./self.seg_total).item()

        self._logger.info(results)
        return {'grounding': {'scores': results, 'instance_results': self.instance_results}}
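Not part of the commit above, but a minimal sketch (toy tensors, hypothetical values) of how the cumulative metrics (cIoU) and per-instance means (mIoU) reported by this evaluator differ:

import torch

# Two toy flattened binary masks (batch of 2) and their ground truth.
pred = torch.tensor([[1, 1, 0, 0], [1, 0, 0, 0]]).bool()
gt   = torch.tensor([[1, 0, 0, 0], [1, 1, 1, 1]]).bool()

I = (pred & gt).sum(-1).float()   # per-sample intersection
U = (pred | gt).sum(-1).float()   # per-sample union
iou = I / (U + 1e-6)

mIoU = iou.mean()                 # mean of per-sample IoUs, as in results['mIoU']
cIoU = I.sum() / U.sum()          # cumulative IoU over all pixels, as in results['cIoU']
print(mIoU.item(), cIoU.item())   # 0.375 vs ~0.333: large objects dominate cIoU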
datasets/evaluation/instance_evaluation.py
ADDED
@@ -0,0 +1,107 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from tabulate import tabulate

import detectron2.utils.comm as comm
from detectron2.config import CfgNode
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco
from detectron2.evaluation.fast_eval_api import COCOeval_opt
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import create_small_table


# modified from COCOEvaluator for instance segmentation
class InstanceSegEvaluator(COCOEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.
    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
    the metric cannot be computed (e.g. due to no predictions made).

    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.
    """

    def _eval_predictions(self, predictions, img_ids=None):
        """
        Evaluate predictions. Fill self._results with the metrics of the tasks.
        """
        self._logger.info("Preparing results for COCO format ...")
        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
        tasks = self._tasks or self._tasks_from_predictions(coco_results)

        # unmap the category ids for COCO
        if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
            dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id
            # all_contiguous_ids = list(dataset_id_to_contiguous_id.values())
            # num_classes = len(all_contiguous_ids)
            # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1

            reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}
            for result in coco_results:
                category_id = result["category_id"]
                # assert category_id < num_classes, (
                #     f"A prediction has class={category_id}, "
                #     f"but the dataset only has {num_classes} classes and "
                #     f"predicted class id should be in [0, {num_classes - 1}]."
                # )
                assert category_id in reverse_id_mapping, (
                    f"A prediction has class={category_id}, "
                    f"but the dataset only has class ids in {dataset_id_to_contiguous_id}."
                )
                result["category_id"] = reverse_id_mapping[category_id]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(coco_results))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info(
            "Evaluating predictions with {} COCO API...".format(
                "unofficial" if self._use_fast_impl else "official"
            )
        )
        for task in sorted(tasks):
            assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
            coco_eval = (
                _evaluate_predictions_on_coco(
                    self._coco_api,
                    coco_results,
                    task,
                    kpt_oks_sigmas=self._kpt_oks_sigmas,
                    use_fast_impl=self._use_fast_impl,
                    img_ids=img_ids,
                    max_dets_per_image=self._max_dets_per_image,
                )
                if len(coco_results) > 0
                else None  # cocoapi does not handle empty results very well
            )

            res = self._derive_coco_results(
                coco_eval, task, class_names=self._metadata.get("thing_classes")
            )
            self._results[task] = res
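Not part of the commit: the change relative to the stock COCOEvaluator is that sparse, non-contiguous dataset ids are allowed, so only membership in the reverse mapping is asserted. A small sketch with a hypothetical mapping:

# Hypothetical dataset whose category ids are a sparse subset of COCO.
dataset_id_to_contiguous_id = {7: 0, 21: 1, 85: 2}
reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()}

pred_class = 2                                      # contiguous id predicted by the model
assert pred_class in reverse_id_mapping             # no 0..N-1 range assumption
coco_category_id = reverse_id_mapping[pred_class]   # -> 85, written back into the result dict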
datasets/evaluation/interactive_evaluation.py
ADDED
@@ -0,0 +1,122 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import os

import numpy as np
import torch
from torchvision.ops import box_iou

from detectron2.structures import BoxMode
from detectron2.data import MetadataCatalog
from detectron2.utils.comm import all_gather, gather, is_main_process, synchronize
from detectron2.evaluation.evaluator import DatasetEvaluator


class InteractiveEvaluator(DatasetEvaluator):
    """
    Evaluate point interactive IoU metrics.
    """

    def __init__(
        self,
        dataset_name,
        output_dir,
        max_clicks=20,
        iou_iter=1,
        compute_box=False,
        distributed=True,
    ):
        self._logger = logging.getLogger(__name__)
        self._dataset_name = dataset_name
        self._distributed = distributed
        self._cpu_device = torch.device("cpu")
        self._output_dir = output_dir

        self.max_clicks = max_clicks
        self.iou_iter = iou_iter
        meta = MetadataCatalog.get(dataset_name)

    def reset(self):
        self.iou_list = []
        self.num_samples = 0
        self.all_ious = [0.5, 0.8, 0.85, 0.9]

    def process(self, inputs, outputs):
        self.iou_list += [o['mask_iou'] for o in outputs]
        self.num_samples += len(outputs)

    def compute_noc(self):
        def _get_noc(iou_arr, iou_thr):
            vals = iou_arr >= iou_thr
            return vals.max(dim=0)[1].item() + 1 if vals.any() else self.max_clicks

        noc_list = {}
        for iou_thr in self.all_ious:
            scores_arr = [_get_noc(iou_arr, iou_thr) for iou_arr in self.iou_list]
            noc_list[str(iou_thr)] = scores_arr

        iou_before_max_iter = torch.stack(self.iou_list)[:, self.iou_iter-1]
        noc_list_sum = {key: sum(value)*1.0 for key, value in noc_list.items()}

        if self._distributed:
            num_samples = sum(all_gather(self.num_samples))
            noc_list_sum_gather = all_gather(noc_list_sum)
            iou_before_max_gather = all_gather(iou_before_max_iter.sum().cpu())

            noc_list_sum = {key: 0 for key in noc_list_sum_gather[0]}
            for nlg in noc_list_sum_gather:
                for key, value in nlg.items():
                    noc_list_sum[key] += value

        pred_noc = {}
        if self._distributed and (not is_main_process()):
            return pred_noc

        for key, value in noc_list_sum.items():
            pred_noc[key] = value / num_samples

        pred_noc['iou_max_iter'] = sum([x.item() for x in iou_before_max_gather]) / num_samples
        return pred_noc

    def evaluate(self):
        pred_noc = self.compute_noc()

        if self._distributed and (not is_main_process()):
            return

        def draw_iou_curve(iou_list, save_dir):
            iou_list = torch.stack(iou_list, dim=0)
            iou_list = iou_list.mean(dim=0).cpu().numpy()
            # draw iou curve, with x-axis as number of clicks, y-axis as iou using matplotlib
            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(range(1, self.max_clicks+1), iou_list)
            plt.xlabel('Number of clicks')
            plt.ylabel('IoU')

            # create directory if not exist
            import os
            output_dir = os.path.join(save_dir, 'iou_by_clicks')
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            # get current time and format in 10 digits
            import time
            current_time = time.time()
            current_time = int(current_time)
            current_time = str(current_time)

            # save iou curve
            plt.savefig(os.path.join(output_dir, '{}.png'.format(current_time)))

        draw_iou_curve(self.iou_list, self._output_dir)
        results = {}
        for idx in range(len(self.all_ious)):
            result_str = 'noc@{}'.format(self.all_ious[idx])
            results[result_str] = pred_noc[str(self.all_ious[idx])]

        results['miou@iter{}'.format(self.iou_iter)] = pred_noc['iou_max_iter']

        self._logger.info(results)
        return {'interactive': results}
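Not part of the commit: a minimal sketch (toy values) of the number-of-clicks (NoC) computation used in `compute_noc` above, where each sample stores the IoU reached after click 1..max_clicks:

import torch

max_clicks = 20
iou_arr = torch.tensor([0.42, 0.61, 0.78, 0.83, 0.91])  # IoU after clicks 1..5 for one sample

def get_noc(iou_arr, iou_thr):
    vals = iou_arr >= iou_thr
    # index of the first click reaching the threshold (+1 because clicks are 1-based)
    return vals.max(dim=0)[1].item() + 1 if vals.any() else max_clicks

print(get_noc(iou_arr, 0.8))    # 4 clicks to reach IoU >= 0.8
print(get_noc(iou_arr, 0.95))   # threshold never reached -> capped at max_clicks (20)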
datasets/evaluation/panoptic_evaluation.py
ADDED
@@ -0,0 +1,199 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import contextlib
import io
import itertools
import json
import logging
import numpy as np
import os
import tempfile
from collections import OrderedDict
from typing import Optional
from PIL import Image
from tabulate import tabulate

from detectron2.data import MetadataCatalog
from detectron2.utils import comm
from detectron2.utils.file_io import PathManager

from detectron2.evaluation.evaluator import DatasetEvaluator

logger = logging.getLogger(__name__)


class COCOPanopticEvaluator(DatasetEvaluator):
    """
    Evaluate Panoptic Quality metrics on COCO using PanopticAPI.
    It saves panoptic segmentation prediction in `output_dir`

    It contains a synchronize call and has to be called from all workers.
    """

    def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
        """
        Args:
            dataset_name: name of the dataset
            output_dir: output directory to save results for evaluation.
        """
        self._metadata = MetadataCatalog.get(dataset_name)
        self._thing_contiguous_id_to_dataset_id = {
            v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
        }
        self._stuff_contiguous_id_to_dataset_id = {
            v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items()
        }

        self._output_dir = output_dir
        if self._output_dir is not None:
            PathManager.mkdirs(self._output_dir)

    def reset(self):
        self._predictions = []

    def _convert_category_id(self, segment_info):
        isthing = segment_info.pop("isthing", None)
        if isthing is None:
            # the model produces panoptic category id directly. No more conversion needed
            return segment_info
        if isthing is True:
            segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[
                segment_info["category_id"]
            ]
        else:
            segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[
                segment_info["category_id"]
            ]
        return segment_info

    def process(self, inputs, outputs):
        from panopticapi.utils import id2rgb

        for input, output in zip(inputs, outputs):
            panoptic_img, segments_info = output["panoptic_seg"]
            panoptic_img = panoptic_img.cpu().numpy()
            if segments_info is None:
                # If "segments_info" is None, we assume "panoptic_img" is a
                # H*W int32 image storing the panoptic_id in the format of
                # category_id * label_divisor + instance_id. We reserve -1 for
                # VOID label, and add 1 to panoptic_img since the official
                # evaluation script uses 0 for VOID label.
                label_divisor = self._metadata.label_divisor
                segments_info = []
                for panoptic_label in np.unique(panoptic_img):
                    if panoptic_label == -1:
                        # VOID region.
                        continue
                    pred_class = panoptic_label // label_divisor
                    isthing = (
                        pred_class in self._metadata.thing_dataset_id_to_contiguous_id.values()
                    )
                    segments_info.append(
                        {
                            "id": int(panoptic_label) + 1,
                            "category_id": int(pred_class),
                            "isthing": bool(isthing),
                        }
                    )
                # Official evaluation script uses 0 for VOID label.
                panoptic_img += 1

            file_name = os.path.basename(input["file_name"])
            file_name_png = os.path.splitext(file_name)[0] + ".png"
            with io.BytesIO() as out:
                Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG")
                segments_info = [self._convert_category_id(x) for x in segments_info]
                self._predictions.append(
                    {
                        "image_id": input["image_id"],
                        "file_name": file_name_png,
                        "png_string": out.getvalue(),
                        "segments_info": segments_info,
                    }
                )

    def evaluate(self):
        comm.synchronize()

        self._predictions = comm.gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return

        # PanopticApi requires local files
        gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
        gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)

        with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
            logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
            for p in self._predictions:
                with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
                    f.write(p.pop("png_string"))

            with open(gt_json, "r") as f:
                json_data = json.load(f)
            json_data["annotations"] = self._predictions

            output_dir = self._output_dir or pred_dir
            predictions_json = os.path.join(output_dir, "predictions.json")
            with PathManager.open(predictions_json, "w") as f:
                f.write(json.dumps(json_data))

            from panopticapi.evaluation import pq_compute

            with contextlib.redirect_stdout(io.StringIO()):
                pq_res = pq_compute(
                    gt_json,
                    PathManager.get_local_path(predictions_json),
                    gt_folder=gt_folder,
                    pred_folder=pred_dir,
                )

        res = {}
        res["PQ"] = 100 * pq_res["All"]["pq"]
        res["SQ"] = 100 * pq_res["All"]["sq"]
        res["RQ"] = 100 * pq_res["All"]["rq"]
        res["PQ_th"] = 100 * pq_res["Things"]["pq"]
        res["SQ_th"] = 100 * pq_res["Things"]["sq"]
        res["RQ_th"] = 100 * pq_res["Things"]["rq"]
        res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
        res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
        res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]

        results = OrderedDict({"panoptic_seg": res})
        _print_panoptic_results(pq_res)

        return results


def _print_panoptic_results(pq_res):
    headers = ["", "PQ", "SQ", "RQ", "#categories"]
    data = []
    for name in ["All", "Things", "Stuff"]:
        row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]]
        data.append(row)
    table = tabulate(
        data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center"
    )
    logger.info("Panoptic Evaluation Results:\n" + table)


if __name__ == "__main__":
    from detectron2.utils.logger import setup_logger

    logger = setup_logger()
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--gt-json")
    parser.add_argument("--gt-dir")
    parser.add_argument("--pred-json")
    parser.add_argument("--pred-dir")
    args = parser.parse_args()

    from panopticapi.evaluation import pq_compute

    with contextlib.redirect_stdout(io.StringIO()):
        pq_res = pq_compute(
            args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir
        )
    _print_panoptic_results(pq_res)
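Not part of the commit: a tiny sketch (hypothetical numbers) of the panoptic-id encoding that the fallback branch in `process` decodes, and why the +1 shift exists:

label_divisor = 1000
panoptic_id = 17 * label_divisor + 3        # category 17, instance 3
pred_class = panoptic_id // label_divisor   # -> 17
# -1 marks VOID in the model output; the PanopticAPI ground truth uses 0 for VOID,
# hence the `panoptic_img += 1` shift before the prediction PNG is written.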
datasets/evaluation/retrieval_evaluation.py
ADDED
@@ -0,0 +1,260 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Modified by Xueyan Zou ([email protected]), Ziyi Dou ([email protected])
# --------------------------------------------------------
import copy
import itertools
import logging
from collections import OrderedDict
import torch
from pycocotools.cocoeval import COCOeval

import detectron2.utils.comm as comm
from detectron2.evaluation.evaluator import DatasetEvaluator

try:
    from detectron2.evaluation.fast_eval_api import COCOeval_opt
except ImportError:
    COCOeval_opt = COCOeval


class RetrievalEvaluator(DatasetEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.
    The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means
    the metric cannot be computed (e.g. due to no predictions made).
    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.
    """

    def __init__(
        self,
        dataset_name=None,
        output_dir=None,
        ensemble=False,
        distributed=True,
    ):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated.
                It must have either the following corresponding metadata:
                "json_file": the path to the COCO format annotation
                Or it must be in detectron2's standard dataset format
                so it can be converted to COCO format automatically.
            tasks (tuple[str]): tasks that can be evaluated under the given
                configuration. A task is one of "bbox", "segm", "keypoints".
                By default, will infer this automatically from predictions.
            distributed (True): if True, will collect results from all ranks and run evaluation
                in the main process.
                Otherwise, will only evaluate the results in the current process.
            output_dir (str): optional, an output directory to dump all
                results predicted on the dataset. The dump contains two files:
                1. "instances_predictions.pth" a file that can be loaded with `torch.load` and
                contains all the results in the format they are produced by the model.
                2. "coco_instances_results.json" a json file in COCO's result format.
            max_dets_per_image (int): limit on the maximum number of detections per image.
                By default in COCO, this limit is to 100, but this can be customized
                to be greater, as is needed in evaluation metrics AP fixed and AP pool
                (see https://arxiv.org/pdf/2102.01066.pdf)
                This doesn't affect keypoint evaluation.
            use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP.
                Although the results should be very close to the official implementation in COCO
                API, it is still recommended to compute results with the official API for use in
                papers. The faster implementation also uses more RAM.
            kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS.
                See http://cocodataset.org/#keypoints-eval
                When empty, it will use the defaults in COCO.
                Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS.
            allow_cached_coco (bool): Whether to use cached coco json from previous validation
                runs. You should set this to False if you need to use different validation data.
                Defaults to True.
        """
        self._logger = logging.getLogger(__name__)
        self._dataset_name = dataset_name
        self._output_dir = output_dir
        self._ensemble = ensemble
        self._distributed = distributed

        if 'p2i' in dataset_name:
            self.mode = 'patch2image'
        elif 'interactive2i' in dataset_name:
            self.mode = 'interactive2image'
        else:
            self.mode = 'default'

    def reset(self):
        self._text_embeds = []
        self._image_embeds = []
        self._image_embeds2 = []
        self._text_ids = []
        self._image_ids = []

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
                It is a list of dict. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a COCO model. It is a list of dicts with key
                "instances" that contains :class:`Instances`.
        """
        for output in outputs:
            self._text_ids.extend(output['caption']['caption_ids'])
            self._image_ids.append(output['caption']['image_ids'])
            self._text_embeds.append(output['caption']['text_embeds'])
            self._image_embeds.append(output['caption']['image_embeds'][0])
            if self._ensemble:
                self._image_embeds2.append(output['caption']['image_embeds'][1])

    def evaluate(self, img_ids=None):
        if self.mode == 'default':
            return self.evaluate_default(img_ids)
        elif self.mode in ['patch2image', 'interactive2image']:
            return self.evaluate_p2i(img_ids)
        else:
            assert False, "Unknown mode for retrieval evaluation"

    def evaluate_default(self, img_ids=None):
        """
        Args:
            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
        """

        if self._distributed:
            comm.synchronize()
            def gather(x, move=False):
                x = comm.gather(x)
                x = list(itertools.chain(*x))
                if move:
                    x = [xx.to(self._text_embeds[0].device) for xx in x]
                return x
            text_embeds = gather(self._text_embeds, move=True)
            image_embeds = gather(self._image_embeds, move=True)
            if self._ensemble:
                image_embeds2 = gather(self._image_embeds2, move=True)
            text_ids = gather(self._text_ids)
            image_ids = gather(self._image_ids)
            if not comm.is_main_process():
                return {}
        else:
            text_embeds = self._text_embeds
            image_embeds = self._image_embeds
            if self._ensemble:
                image_embeds2 = self._image_embeds2
            text_ids = self._text_ids
            image_ids = self._image_ids
        if len(text_embeds) == 0:
            self._logger.warning("[COCOCaptionEvaluator] Did not receive valid predictions.")
            return {}
        iids = torch.tensor(image_ids).view(-1).cuda()
        tiids = torch.tensor(text_ids).view(-1).cuda()
        image_embeds = torch.cat(image_embeds)
        text_embeds = torch.cat(text_embeds)
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
        scores = image_embeds @ text_embeds.t()

        if self._ensemble:
            image_embeds2 = torch.cat(image_embeds2)
            image_embeds2 = image_embeds2 / image_embeds2.norm(dim=-1, keepdim=True)
            scores2 = image_embeds2 @ text_embeds.t()
            scores = scores2 * 0.5 + scores * 0.5

        topk10 = scores.topk(10, dim=1)
        topk5 = scores.topk(5, dim=1)
        topk1 = scores.topk(1, dim=1)
        topk10_iids = tiids[topk10.indices]
        topk5_iids = tiids[topk5.indices]
        topk1_iids = tiids[topk1.indices]
        tr_r10 = (iids.unsqueeze(1) == topk10_iids).float().max(dim=1)[0].mean()
        tr_r5 = (iids.unsqueeze(1) == topk5_iids).float().max(dim=1)[0].mean()
        tr_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()
        topk10 = scores.topk(10, dim=0)
        topk5 = scores.topk(5, dim=0)
        topk1 = scores.topk(1, dim=0)
        topk10_iids = iids[topk10.indices]
        topk5_iids = iids[topk5.indices]
        topk1_iids = iids[topk1.indices]
        ir_r10 = (tiids.unsqueeze(0) == topk10_iids).float().max(dim=0)[0].mean()
        ir_r5 = (tiids.unsqueeze(0) == topk5_iids).float().max(dim=0)[0].mean()
        ir_r1 = (tiids.unsqueeze(0) == topk1_iids).float().max(dim=0)[0].mean()
        self._results = OrderedDict()
        # Copy so the caller can do whatever with results
        self._results['recall'] = {}
        self._results['recall']['irtr'] = float("{:.3f}".format((ir_r1 + tr_r1).item() * 100))
        self._results['recall']['ir1'] = float("{:.3f}".format(ir_r1.item() * 100))
        self._results['recall']['ir5'] = float("{:.3f}".format(ir_r5.item() * 100))
        self._results['recall']['ir10'] = float("{:.3f}".format(ir_r10.item() * 100))
        self._results['recall']['tr1'] = float("{:.3f}".format(tr_r1.item() * 100))
        self._results['recall']['tr5'] = float("{:.3f}".format(tr_r5.item() * 100))
        self._results['recall']['tr10'] = float("{:.3f}".format(tr_r10.item() * 100))
        self._logger.info(self._results)
        return copy.deepcopy(self._results)

    def evaluate_p2i(self, img_ids=None):
        """
        Args:
            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
        """

        if self._distributed:
            comm.synchronize()
            def gather(x, move=False):
                x = comm.gather(x)
                x = list(itertools.chain(*x))
                if move:
                    x = [xx.to(self._text_embeds[0].device) for xx in x]
                return x
            text_embeds = gather(self._text_embeds, move=True)
            image_embeds = gather(self._image_embeds, move=True)
            image_embeds2 = gather(self._image_embeds2, move=True)
            text_ids = gather(self._text_ids)
            image_ids = gather(self._image_ids)
            if not comm.is_main_process():
                return {}
        else:
            text_embeds = self._text_embeds
            image_embeds = self._image_embeds
            image_embeds2 = self._image_embeds2
            text_ids = self._text_ids
            image_ids = self._image_ids

        if len(text_embeds) == 0:
            self._logger.warning("[COCOCaptionEvaluator] Did not receive valid predictions.")
            return {}

        iids = torch.tensor(image_ids).view(-1).cuda()
        tiids = torch.tensor(text_ids).view(-1).cuda()
        image_embeds = torch.cat(image_embeds)
        text_embeds = torch.cat(text_embeds)
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        image_embeds2 = torch.cat(image_embeds2)
        image_embeds2 = image_embeds2 / image_embeds2.norm(dim=-1, keepdim=True)

        # compute image to image retrieval
        self._results = OrderedDict()
        self._results['recall'] = {}
        ii_scores = image_embeds2 @ image_embeds.t()

        topk10 = ii_scores.topk(10, dim=1)
        topk5 = ii_scores.topk(5, dim=1)
        topk1 = ii_scores.topk(1, dim=1)
        topk10_iids = iids[topk10.indices]
        topk5_iids = iids[topk5.indices]
        topk1_iids = iids[topk1.indices]
        iir_r10 = (iids.unsqueeze(1) == topk10_iids).float().max(dim=1)[0].mean()
        iir_r5 = (iids.unsqueeze(1) == topk5_iids).float().max(dim=1)[0].mean()
        iir_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()
        # Copy so the caller can do whatever with results
        self._results['recall']['p2ir1'] = float("{:.3f}".format(iir_r1.item() * 100))
        self._results['recall']['p2ir5'] = float("{:.3f}".format(iir_r5.item() * 100))
        self._results['recall']['p2ir10'] = float("{:.3f}".format(iir_r10.item() * 100))
        self._logger.info(self._results)
        return copy.deepcopy(self._results)
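Not part of the commit: a minimal CPU-only sketch (random toy embeddings, hypothetical ids) of the recall@K logic used above for text retrieval:

import torch

# 3 images and 3 captions, matched by shared ids.
image_embeds = torch.nn.functional.normalize(torch.randn(3, 8), dim=-1)
text_embeds = torch.nn.functional.normalize(torch.randn(3, 8), dim=-1)
iids = torch.tensor([0, 1, 2])
tiids = torch.tensor([0, 1, 2])

scores = image_embeds @ text_embeds.t()               # image-to-text similarity matrix
topk1_iids = tiids[scores.topk(1, dim=1).indices]      # id of the best caption per image
tr_r1 = (iids.unsqueeze(1) == topk1_iids).float().max(dim=1)[0].mean()
print(tr_r1.item())   # fraction of images whose top-1 retrieved caption id matches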
datasets/evaluation/segmentation_evaluation.py
ADDED
@@ -0,0 +1,195 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import itertools
import json
import logging
import numpy as np
import os
from collections import OrderedDict
import PIL.Image as Image
import pycocotools.mask as mask_util
import torch

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.comm import all_gather, is_main_process
from detectron2.utils.file_io import PathManager
from detectron2.evaluation.evaluator import DatasetEvaluator
from utilities.distributed import synchronize

from ..semseg_loader import load_semseg


class SemSegEvaluator(DatasetEvaluator):
    """
    Evaluate semantic segmentation metrics.
    """

    def __init__(
        self,
        dataset_name,
        distributed=True,
        output_dir=None,
        *,
        num_classes=None,
        ignore_label=None,
    ):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated.
            distributed (bool): if True, will collect results from all ranks for evaluation.
                Otherwise, will evaluate the results in the current process.
            output_dir (str): an output directory to dump results.
            num_classes, ignore_label: deprecated argument
        """
        self._logger = logging.getLogger(__name__)
        if num_classes is not None:
            self._logger.warn(
                "SemSegEvaluator(num_classes) is deprecated! It should be obtained from metadata."
            )
        if ignore_label is not None:
            self._logger.warn(
                "SemSegEvaluator(ignore_label) is deprecated! It should be obtained from metadata."
            )
        self._dataset_name = dataset_name
        self._distributed = distributed
        self._output_dir = output_dir

        self._cpu_device = torch.device("cpu")

        self.input_file_to_gt_file = {
            dataset_record["file_name"]: dataset_record["sem_seg_file_name"]
            for dataset_record in DatasetCatalog.get(dataset_name)
        }

        meta = MetadataCatalog.get(dataset_name)
        # Dict that maps contiguous training ids to COCO category ids
        try:
            c2d = meta.stuff_dataset_id_to_contiguous_id
            self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()}
        except AttributeError:
            self._contiguous_id_to_dataset_id = None
        self._class_names = meta.stuff_classes
        self._class_offset = meta.class_offset if hasattr(meta, 'class_offset') else 0
        self._num_classes = len(meta.stuff_classes)
        self._semseg_loader = meta.semseg_loader if hasattr(meta, 'semseg_loader') else 'PIL'

        if num_classes is not None:
            assert self._num_classes == num_classes, f"{self._num_classes} != {num_classes}"
        self._ignore_label = ignore_label if ignore_label is not None else meta.ignore_label

    def reset(self):
        self._conf_matrix = np.zeros((self._num_classes + 1, self._num_classes + 1), dtype=np.int64)
        self._predictions = []

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name".
            outputs: the outputs of a model. It is either list of semantic segmentation predictions
                (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic
                segmentation prediction in the same format.
        """
        for input, output in zip(inputs, outputs):
            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device)
            pred = np.array(output, dtype=np.int)

            with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f:
                gt = load_semseg(f, self._semseg_loader) - self._class_offset

            if isinstance(self._ignore_label, int):
                ignore_label = self._ignore_label - self._class_offset
                gt[gt == self._ignore_label] = self._num_classes
            elif isinstance(self._ignore_label, list):
                for ignore_label in self._ignore_label:
                    ignore_label = ignore_label - self._class_offset
                    gt[gt == ignore_label] = self._num_classes

            self._conf_matrix += np.bincount(
                (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
                minlength=self._conf_matrix.size,
            ).reshape(self._conf_matrix.shape)

            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))

    def evaluate(self):
        """
        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

        * Mean intersection-over-union averaged across classes (mIoU)
        * Frequency Weighted IoU (fwIoU)
        * Mean pixel accuracy averaged across classes (mACC)
        * Pixel Accuracy (pACC)
        """
        if self._distributed:
            synchronize()
            conf_matrix_list = all_gather(self._conf_matrix)
            self._predictions = all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not is_main_process():
                return
            self._conf_matrix = np.zeros_like(self._conf_matrix)
            for conf_matrix in conf_matrix_list:
                self._conf_matrix += conf_matrix

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        acc = np.full(self._num_classes, np.nan, dtype=np.float)
        iou = np.full(self._num_classes, np.nan, dtype=np.float)
        tp = self._conf_matrix.diagonal()[:-1].astype(np.float)
        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float)
        class_weights = pos_gt / np.sum(pos_gt)
        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float)
        acc_valid = pos_gt > 0
        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
        iou_valid = (pos_gt + pos_pred) > 0
        union = pos_gt + pos_pred - tp
        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
        miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
        fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
        pacc = np.sum(tp) / np.sum(pos_gt)

        res = {}
        res["mIoU"] = 100 * miou
        res["fwIoU"] = 100 * fiou
        for i, name in enumerate(self._class_names):
            res["IoU-{}".format(name)] = 100 * iou[i]
        res["mACC"] = 100 * macc
        res["pACC"] = 100 * pacc
        for i, name in enumerate(self._class_names):
            res["ACC-{}".format(name)] = 100 * acc[i]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)
        results = OrderedDict({"sem_seg": res})
        self._logger.info(results)
        return results

    def encode_json_sem_seg(self, sem_seg, input_file_name):
        """
        Convert semantic segmentation to COCO stuff format with segments encoded as RLEs.
        See http://cocodataset.org/#format-results
        """
        json_list = []
        for label in np.unique(sem_seg):
            if self._contiguous_id_to_dataset_id is not None:
                assert (
                    label in self._contiguous_id_to_dataset_id
                ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name)
                dataset_id = self._contiguous_id_to_dataset_id[label]
            else:
                dataset_id = int(label)
            mask = (sem_seg == label).astype(np.uint8)
            mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0]
            mask_rle["counts"] = mask_rle["counts"].decode("utf-8")
            json_list.append(
                {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle}
            )
        return json_list
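Not part of the commit: a minimal sketch (toy 2-class labels) of the confusion-matrix accumulation and per-class IoU used by the evaluator above; the extra row/column plays the role of the ignore bin:

import numpy as np

num_classes = 2
pred = np.array([0, 0, 1, 1])
gt   = np.array([0, 1, 1, 2])          # "2" stands in for the ignore label

conf = np.bincount((num_classes + 1) * pred + gt,
                   minlength=(num_classes + 1) ** 2).reshape(num_classes + 1, num_classes + 1)

tp = conf.diagonal()[:-1].astype(float)
pos_gt = conf[:-1, :-1].sum(axis=0).astype(float)
pos_pred = conf[:-1, :-1].sum(axis=1).astype(float)
iou = tp / (pos_gt + pos_pred - tp)
print(iou, iou.mean())                  # per-class IoU and mIoU for the two classes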
datasets/refer.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__author__ = 'licheng'
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
This interface provides access to four datasets:
|
| 5 |
+
1) refclef
|
| 6 |
+
2) refcoco
|
| 7 |
+
3) refcoco+
|
| 8 |
+
4) refcocog
|
| 9 |
+
split by unc and google
|
| 10 |
+
|
| 11 |
+
The following API functions are defined:
|
| 12 |
+
REFER - REFER api class
|
| 13 |
+
getRefIds - get ref ids that satisfy given filter conditions.
|
| 14 |
+
getAnnIds - get ann ids that satisfy given filter conditions.
|
| 15 |
+
getImgIds - get image ids that satisfy given filter conditions.
|
| 16 |
+
getCatIds - get category ids that satisfy given filter conditions.
|
| 17 |
+
loadRefs - load refs with the specified ref ids.
|
| 18 |
+
loadAnns - load anns with the specified ann ids.
|
| 19 |
+
loadImgs - load images with the specified image ids.
|
| 20 |
+
loadCats - load category names with the specified category ids.
|
| 21 |
+
getRefBox - get ref's bounding box [x, y, w, h] given the ref_id
|
| 22 |
+
showRef - show image, segmentation or box of the referred object with the ref
|
| 23 |
+
getMask - get mask and area of the referred object given ref
|
| 24 |
+
showMask - show mask of the referred object given ref
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from doctest import REPORT_ONLY_FIRST_FAILURE
|
| 28 |
+
import sys
|
| 29 |
+
import os.path as osp
|
| 30 |
+
import json
|
| 31 |
+
import pickle
|
| 32 |
+
import time
|
| 33 |
+
import itertools
|
| 34 |
+
import skimage.io as io
|
| 35 |
+
import matplotlib.pyplot as plt
|
| 36 |
+
from matplotlib.collections import PatchCollection
|
| 37 |
+
from matplotlib.patches import Polygon, Rectangle
|
| 38 |
+
from pprint import pprint
|
| 39 |
+
import numpy as np
|
| 40 |
+
from pycocotools import mask
|
| 41 |
+
# import cv2
|
| 42 |
+
# from skimage.measure import label, regionprops
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class REFER:
|
| 46 |
+
def __init__(self, data_root, dataset='refcoco', splitBy='unc'):
|
| 47 |
+
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
|
| 48 |
+
# also provide dataset name and splitBy information
|
| 49 |
+
# e.g., dataset = 'refcoco', splitBy = 'unc'
|
| 50 |
+
print('loading dataset {} into memory...'.format(dataset))
|
| 51 |
+
self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
|
| 52 |
+
self.DATA_DIR = osp.join(data_root, dataset)
|
| 53 |
+
if dataset in ['refcoco', 'refcoco+', 'refcocog']:
|
| 54 |
+
self.IMAGE_DIR = osp.join(data_root, 'images/mscoco/images/train2014')
|
| 55 |
+
elif dataset == 'refclef':
|
| 56 |
+
self.IMAGE_DIR = osp.join(data_root, 'images/saiapr_tc-12')
|
| 57 |
+
else:
|
| 58 |
+
print('No refer dataset is called [{}]'.format(dataset))
|
| 59 |
+
sys.exit()
|
| 60 |
+
|
| 61 |
+
# load refs from data/dataset/refs(dataset).json
|
| 62 |
+
tic = time.time()
|
| 63 |
+
ref_file = osp.join(self.DATA_DIR, 'refs('+splitBy+').p')
|
| 64 |
+
self.data = {}
|
| 65 |
+
self.data['dataset'] = dataset
|
| 66 |
+
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
|
| 67 |
+
|
| 68 |
+
# load annotations from data/dataset/instances.json
|
| 69 |
+
instances_file = osp.join(self.DATA_DIR, 'instances.json')
|
| 70 |
+
instances = json.load(open(instances_file, 'r'))
|
| 71 |
+
self.data['images'] = instances['images']
|
| 72 |
+
self.data['annotations'] = instances['annotations']
|
| 73 |
+
self.data['categories'] = instances['categories']
|
| 74 |
+
|
| 75 |
+
# create index
|
| 76 |
+
self.createIndex()
|
| 77 |
+
print('DONE (t=%.2fs)'.format(time.time()-tic))
|
| 78 |
+
|
| 79 |
+
def createIndex(self):
|
| 80 |
+
# create sets of mapping
|
| 81 |
+
# 1) Refs: {ref_id: ref}
|
| 82 |
+
# 2) Anns: {ann_id: ann}
|
| 83 |
+
# 3) Imgs: {image_id: image}
|
| 84 |
+
# 4) Cats: {category_id: category_name}
|
| 85 |
+
# 5) Sents: {sent_id: sent}
|
| 86 |
+
# 6) imgToRefs: {image_id: refs}
|
| 87 |
+
# 7) imgToAnns: {image_id: anns}
|
| 88 |
+
# 8) refToAnn: {ref_id: ann}
|
| 89 |
+
# 9) annToRef: {ann_id: ref}
|
| 90 |
+
# 10) catToRefs: {category_id: refs}
|
| 91 |
+
# 11) sentToRef: {sent_id: ref}
|
| 92 |
+
# 12) sentToTokens: {sent_id: tokens}
|
| 93 |
+
print('creating index...')
|
| 94 |
+
# fetch info from instances
|
| 95 |
+
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
|
| 96 |
+
for ann in self.data['annotations']:
|
| 97 |
+
Anns[ann['id']] = ann
|
| 98 |
+
imgToAnns[ann['image_id']] = imgToAnns.get(
|
| 99 |
+
ann['image_id'], []) + [ann]
|
| 100 |
+
for img in self.data['images']:
|
| 101 |
+
Imgs[img['id']] = img
|
| 102 |
+
for cat in self.data['categories']:
|
| 103 |
+
Cats[cat['id']] = cat['name']
|
| 104 |
+
|
| 105 |
+
# fetch info from refs
|
| 106 |
+
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
|
| 107 |
+
Sents, sentToRef, sentToTokens = {}, {}, {}
|
| 108 |
+
for ref in self.data['refs']:
|
| 109 |
+
# ids
|
| 110 |
+
ref_id = ref['ref_id']
|
| 111 |
+
ann_id = ref['ann_id']
|
| 112 |
+
category_id = ref['category_id']
|
| 113 |
+
image_id = ref['image_id']
|
| 114 |
+
|
| 115 |
+
# add mapping related to ref
|
| 116 |
+
Refs[ref_id] = ref
|
| 117 |
+
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
|
| 118 |
+
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
|
| 119 |
+
refToAnn[ref_id] = Anns[ann_id]
|
| 120 |
+
annToRef[ann_id] = ref
|
| 121 |
+
|
| 122 |
+
# add mapping of sent
|
| 123 |
+
for sent in ref['sentences']:
|
| 124 |
+
Sents[sent['sent_id']] = sent
|
| 125 |
+
sentToRef[sent['sent_id']] = ref
|
| 126 |
+
sentToTokens[sent['sent_id']] = sent['tokens']
|
| 127 |
+
|
| 128 |
+
# create class members
|
| 129 |
+
self.Refs = Refs
|
| 130 |
+
self.Anns = Anns
|
| 131 |
+
self.Imgs = Imgs
|
| 132 |
+
self.Cats = Cats
|
| 133 |
+
self.Sents = Sents
|
| 134 |
+
self.imgToRefs = imgToRefs
|
| 135 |
+
self.imgToAnns = imgToAnns
|
| 136 |
+
self.refToAnn = refToAnn
|
| 137 |
+
self.annToRef = annToRef
|
| 138 |
+
self.catToRefs = catToRefs
|
| 139 |
+
self.sentToRef = sentToRef
|
| 140 |
+
self.sentToTokens = sentToTokens
|
| 141 |
+
print('index created.')
|
| 142 |
+
|
| 143 |
+
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
|
| 144 |
+
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
| 145 |
+
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
| 146 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 147 |
+
|
| 148 |
+
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
|
| 149 |
+
refs = self.data['refs']
|
| 150 |
+
else:
|
| 151 |
+
if not len(image_ids) == 0:
|
| 152 |
+
refs = [self.imgToRefs[image_id] for image_id in image_ids]
|
| 153 |
+
else:
|
| 154 |
+
refs = self.data['refs']
|
| 155 |
+
if not len(cat_ids) == 0:
|
| 156 |
+
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
|
| 157 |
+
if not len(ref_ids) == 0:
|
| 158 |
+
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
|
| 159 |
+
if not len(split) == 0:
|
| 160 |
+
if split in ['testA', 'testB', 'testC']:
|
| 161 |
+
# we also consider testAB, testBC, ...
|
| 162 |
+
refs = [ref for ref in refs if split[-1] in ref['split']]
|
| 163 |
+
elif split in ['testAB', 'testBC', 'testAC']:
|
| 164 |
+
# rarely used I guess...
|
| 165 |
+
refs = [ref for ref in refs if ref['split'] == split]
|
| 166 |
+
elif split == 'test':
|
| 167 |
+
refs = [ref for ref in refs if 'test' in ref['split']]
|
| 168 |
+
elif split == 'train' or split == 'val':
|
| 169 |
+
refs = [ref for ref in refs if ref['split'] == split]
|
| 170 |
+
else:
|
| 171 |
+
print('No such split [{}]'.format(split))
|
| 172 |
+
sys.exit()
|
| 173 |
+
ref_ids = [ref['ref_id'] for ref in refs]
|
| 174 |
+
return ref_ids
|
| 175 |
+
|
| 176 |
+
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
|
| 177 |
+
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
| 178 |
+
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
| 179 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 180 |
+
|
| 181 |
+
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
|
| 182 |
+
ann_ids = [ann['id'] for ann in self.data['annotations']]
|
| 183 |
+
else:
|
| 184 |
+
if not len(image_ids) == 0:
|
| 185 |
+
lists = [self.imgToAnns[image_id]
|
| 186 |
+
for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
|
| 187 |
+
anns = list(itertools.chain.from_iterable(lists))
|
| 188 |
+
else:
|
| 189 |
+
anns = self.data['annotations']
|
| 190 |
+
if not len(cat_ids) == 0:
|
| 191 |
+
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
|
| 192 |
+
ann_ids = [ann['id'] for ann in anns]
|
| 193 |
+
if not len(ref_ids) == 0:
|
| 194 |
+
ids = set(ann_ids).intersection(
|
| 195 |
+
set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
|
| 196 |
+
return ann_ids
|
| 197 |
+
|
| 198 |
+
def getImgIds(self, ref_ids=[]):
|
| 199 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 200 |
+
|
| 201 |
+
if not len(ref_ids) == 0:
|
| 202 |
+
image_ids = list(set([self.Refs[ref_id]['image_id']
|
| 203 |
+
for ref_id in ref_ids]))
|
| 204 |
+
else:
|
| 205 |
+
image_ids = self.Imgs.keys()
|
| 206 |
+
return image_ids
|
| 207 |
+
|
| 208 |
+
def getCatIds(self):
|
| 209 |
+
return self.Cats.keys()
|
| 210 |
+
|
| 211 |
+
def loadRefs(self, ref_ids=[]):
|
| 212 |
+
if type(ref_ids) == list:
|
| 213 |
+
return [self.Refs[ref_id] for ref_id in ref_ids]
|
| 214 |
+
elif type(ref_ids) == int:
|
| 215 |
+
return [self.Refs[ref_ids]]
|
| 216 |
+
|
| 217 |
+
def loadAnns(self, ann_ids=[]):
|
| 218 |
+
if type(ann_ids) == list:
|
| 219 |
+
return [self.Anns[ann_id] for ann_id in ann_ids]
|
| 220 |
+
elif type(ann_ids) == int or type(ann_ids) == str:
|
| 221 |
+
return [self.Anns[ann_ids]]
|
| 222 |
+
|
| 223 |
+
def loadImgs(self, image_ids=[]):
|
| 224 |
+
if type(image_ids) == list:
|
| 225 |
+
return [self.Imgs[image_id] for image_id in image_ids]
|
| 226 |
+
elif type(image_ids) == int:
|
| 227 |
+
return [self.Imgs[image_ids]]
|
| 228 |
+
|
| 229 |
+
def loadCats(self, cat_ids=[]):
|
| 230 |
+
if type(cat_ids) == list:
|
| 231 |
+
return [self.Cats[cat_id] for cat_id in cat_ids]
|
| 232 |
+
elif type(cat_ids) == int:
|
| 233 |
+
return [self.Cats[cat_ids]]
|
| 234 |
+
|
| 235 |
+
def getRefBox(self, ref_id):
|
| 236 |
+
ref = self.Refs[ref_id]
|
| 237 |
+
ann = self.refToAnn[ref_id]
|
| 238 |
+
return ann['bbox'] # [x, y, w, h]
|
| 239 |
+
|
| 240 |
+
def showRef(self, ref, seg_box='seg'):
|
| 241 |
+
ax = plt.gca()
|
| 242 |
+
# show image
|
| 243 |
+
image = self.Imgs[ref['image_id']]
|
| 244 |
+
I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
|
| 245 |
+
ax.imshow(I)
|
| 246 |
+
# show refer expression
|
| 247 |
+
for sid, sent in enumerate(ref['sentences']):
|
| 248 |
+
print('{}. {}'.format(sid+1, sent['sent']))
|
| 249 |
+
# show segmentations
|
| 250 |
+
if seg_box == 'seg':
|
| 251 |
+
ann_id = ref['ann_id']
|
| 252 |
+
ann = self.Anns[ann_id]
|
| 253 |
+
polygons = []
|
| 254 |
+
color = []
|
| 255 |
+
c = 'none'
|
| 256 |
+
if type(ann['segmentation'][0]) == list:
|
| 257 |
+
# polygon used for refcoco*
|
| 258 |
+
for seg in ann['segmentation']:
|
| 259 |
+
poly = np.array(seg).reshape((len(seg)//2, 2))
|
| 260 |
+
polygons.append(Polygon(poly, True, alpha=0.4))
|
| 261 |
+
color.append(c)
|
| 262 |
+
p = PatchCollection(polygons, facecolors=color, edgecolors=(
|
| 263 |
+
1, 1, 0, 0), linewidths=3, alpha=1)
|
| 264 |
+
ax.add_collection(p) # thick yellow polygon
|
| 265 |
+
p = PatchCollection(polygons, facecolors=color, edgecolors=(
|
| 266 |
+
1, 0, 0, 0), linewidths=1, alpha=1)
|
| 267 |
+
ax.add_collection(p) # thin red polygon
|
| 268 |
+
else:
|
| 269 |
+
# mask used for refclef
|
| 270 |
+
rle = ann['segmentation']
|
| 271 |
+
m = mask.decode(rle)
|
| 272 |
+
img = np.ones((m.shape[0], m.shape[1], 3))
|
| 273 |
+
color_mask = np.array([2.0, 166.0, 101.0])/255
|
| 274 |
+
for i in range(3):
|
| 275 |
+
img[:, :, i] = color_mask[i]
|
| 276 |
+
ax.imshow(np.dstack((img, m*0.5)))
|
| 277 |
+
# show bounding-box
|
| 278 |
+
elif seg_box == 'box':
|
| 279 |
+
ann_id = ref['ann_id']
|
| 280 |
+
ann = self.Anns[ann_id]
|
| 281 |
+
bbox = self.getRefBox(ref['ref_id'])
|
| 282 |
+
box_plot = Rectangle(
|
| 283 |
+
(bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
|
| 284 |
+
ax.add_patch(box_plot)
|
| 285 |
+
|
| 286 |
+
def getMask(self, ref):
|
| 287 |
+
# return mask, area and mask-center
|
| 288 |
+
ann = self.refToAnn[ref['ref_id']]
|
| 289 |
+
image = self.Imgs[ref['image_id']]
|
| 290 |
+
if type(ann['segmentation'][0]) == list: # polygon
|
| 291 |
+
rle = mask.frPyObjects(
|
| 292 |
+
ann['segmentation'], image['height'], image['width'])
|
| 293 |
+
else:
|
| 294 |
+
rle = ann['segmentation']
|
| 295 |
+
m = mask.decode(rle)
|
| 296 |
+
# sometimes there are multiple binary map (corresponding to multiple segs)
|
| 297 |
+
m = np.sum(m, axis=2)
|
| 298 |
+
m = m.astype(np.uint8) # convert to np.uint8
|
| 299 |
+
# compute area
|
| 300 |
+
area = sum(mask.area(rle)) # should be close to ann['area']
|
| 301 |
+
return {'mask': m, 'area': area}
|
| 302 |
+
# # position
|
| 303 |
+
# position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)
|
| 304 |
+
# position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)
|
| 305 |
+
# # mass position (if there were multiple regions, we use the largest one.)
|
| 306 |
+
# label_m = label(m, connectivity=m.ndim)
|
| 307 |
+
# regions = regionprops(label_m)
|
| 308 |
+
# if len(regions) > 0:
|
| 309 |
+
# largest_id = np.argmax(np.array([props.filled_area for props in regions]))
|
| 310 |
+
# largest_props = regions[largest_id]
|
| 311 |
+
# mass_y, mass_x = largest_props.centroid
|
| 312 |
+
# else:
|
| 313 |
+
# mass_x, mass_y = position_x, position_y
|
| 314 |
+
# # if centroid is not in mask, we find the closest point to it from mask
|
| 315 |
+
# if m[mass_y, mass_x] != 1:
|
| 316 |
+
# print 'Finding closes mask point ...'
|
| 317 |
+
# kernel = np.ones((10, 10),np.uint8)
|
| 318 |
+
# me = cv2.erode(m, kernel, iterations = 1)
|
| 319 |
+
# points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style
|
| 320 |
+
# points = np.array(points)
|
| 321 |
+
# dist = np.sum((points - (mass_y, mass_x))**2, axis=1)
|
| 322 |
+
# id = np.argsort(dist)[0]
|
| 323 |
+
# mass_y, mass_x = points[id]
|
| 324 |
+
# # return
|
| 325 |
+
# return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}
|
| 326 |
+
# # show image and mask
|
| 327 |
+
# I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
|
| 328 |
+
# plt.figure()
|
| 329 |
+
# plt.imshow(I)
|
| 330 |
+
# ax = plt.gca()
|
| 331 |
+
# img = np.ones( (m.shape[0], m.shape[1], 3) )
|
| 332 |
+
# color_mask = np.array([2.0,166.0,101.0])/255
|
| 333 |
+
# for i in range(3):
|
| 334 |
+
# img[:,:,i] = color_mask[i]
|
| 335 |
+
# ax.imshow(np.dstack( (img, m*0.5) ))
|
| 336 |
+
# plt.show()
|
| 337 |
+
|
| 338 |
+
def showMask(self, ref):
|
| 339 |
+
M = self.getMask(ref)
|
| 340 |
+
msk = M['mask']
|
| 341 |
+
ax = plt.gca()
|
| 342 |
+
ax.imshow(msk)
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
if __name__ == '__main__':
|
| 346 |
+
refer = REFER(data_root='/home/xueyanz/code/dataset/refcocoseg',
|
| 347 |
+
dataset='refcocog', splitBy='google')
|
| 348 |
+
ref_ids = refer.getRefIds()
|
| 349 |
+
print(len(ref_ids))
|
| 350 |
+
|
| 351 |
+
print(len(refer.Imgs))
|
| 352 |
+
print(len(refer.imgToRefs))
|
| 353 |
+
|
| 354 |
+
ref_ids = refer.getRefIds(split='train')
|
| 355 |
+
print('There are {} training referred objects.'.format(len(ref_ids)))
|
| 356 |
+
|
| 357 |
+
for ref_id in ref_ids:
|
| 358 |
+
ref = refer.loadRefs(ref_id)[0]
|
| 359 |
+
if len(ref['sentences']) < 2:
|
| 360 |
+
continue
|
| 361 |
+
|
| 362 |
+
pprint(ref)
|
| 363 |
+
print('The label is {}.'.format(refer.Cats[ref['category_id']]))
|
| 364 |
+
|
| 365 |
+
# plt.figure()
|
| 366 |
+
# refer.showRef(ref, seg_box='box')
|
| 367 |
+
# plt.show()
|
| 368 |
+
|
| 369 |
+
# plt.figure()
|
| 370 |
+
# refer.showMask(ref)
|
| 371 |
+
# plt.show()
|
datasets/registration/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
from . import (
|
| 2 |
+
register_biomed_datasets
|
| 3 |
+
)
|
datasets/registration/register_biomed_datasets.py
ADDED
|
@@ -0,0 +1,123 @@
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
|
| 3 |
+
# Copyright (c) 2022 Microsoft
|
| 4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 5 |
+
# Modified by Xueyan Zou ([email protected])
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import collections
|
| 10 |
+
|
| 11 |
+
from detectron2.data import DatasetCatalog, MetadataCatalog
|
| 12 |
+
from detectron2.data.datasets import load_sem_seg
|
| 13 |
+
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
|
| 14 |
+
from detectron2.utils.file_io import PathManager
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
_PREDEFINED_SPLITS_BIOMED = {}
|
| 18 |
+
|
| 19 |
+
# example of registering a dataset
|
| 20 |
+
datasets = ['BiomedParseData-Demo', ] # provide name of the dataset under biomedparse_datasets
|
| 21 |
+
splits = ['demo'] # provide split name, e.g., train, test, val. Here there is only one 'demo' split in the example demo dataset
|
| 22 |
+
|
| 23 |
+
# Here we register all the splits of the dataset
|
| 24 |
+
for name in datasets:
|
| 25 |
+
for split in splits:
|
| 26 |
+
dataname = f'biomed_{name.replace("/", "-")}_{split}'
|
| 27 |
+
image_root = f"{name}/{split}"
|
| 28 |
+
ann_root = f"{name}/{split}.json"
|
| 29 |
+
_PREDEFINED_SPLITS_BIOMED[dataname] = (image_root, ann_root)
|
| 30 |
+
# The resulting dataset name is: biomed_BiomedParseData-Demo_demo
|
| 31 |
+
|
| 32 |
+
# # Add your dataset here
|
| 33 |
+
# datasets = ['YOUR_DATASET_NAME', ] # provide name of the dataset under biomedparse_datasets
|
| 34 |
+
# splits = ['train', 'test'] # provide split name, e.g., train, test, val
|
| 35 |
+
|
| 36 |
+
# # Here we register all the splits of the dataset
|
| 37 |
+
# for name in datasets:
|
| 38 |
+
# for split in splits:
|
| 39 |
+
# dataname = f'biomed_{name.replace("/", "-")}_{split}'
|
| 40 |
+
# image_root = f"{name}/{split}"
|
| 41 |
+
# ann_root = f"{name}/{split}.json"
|
| 42 |
+
# _PREDEFINED_SPLITS_BIOMED[dataname] = (image_root, ann_root)
|
| 43 |
+
# # The resulting dataset names are: biomed_YOUR_DATASET_NAME_train, biomed_YOUR_DATASET_NAME_test
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_metadata():
|
| 47 |
+
meta = {}
|
| 48 |
+
return meta
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_biomed_json(image_root, annot_json, metadata):
|
| 52 |
+
"""
|
| 53 |
+
Args:
|
| 54 |
+
image_root (str): path to the image folder of the split, e.g., "BiomedParseData-Demo/demo".
|
| 55 |
+
annot_json (str): path to the annotation json of the split, e.g., "BiomedParseData-Demo/demo.json".
|
| 56 |
+
metadata (dict): dataset metadata from get_metadata() (currently unused inside this loader).
|
| 57 |
+
Returns:
|
| 58 |
+
list[dict]: a list of dicts in Detectron2 standard format. (See
|
| 59 |
+
`Using Custom Datasets </tutorials/datasets.html>`_ )
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
with PathManager.open(annot_json) as f:
|
| 63 |
+
json_info = json.load(f)
|
| 64 |
+
|
| 65 |
+
# build dictionary for grounding
|
| 66 |
+
grd_dict = collections.defaultdict(list)
|
| 67 |
+
for grd_ann in json_info['annotations']:
|
| 68 |
+
image_id = int(grd_ann["image_id"])
|
| 69 |
+
grd_dict[image_id].append(grd_ann)
|
| 70 |
+
|
| 71 |
+
mask_root = image_root + '_mask'
|
| 72 |
+
ret = []
|
| 73 |
+
for image in json_info["images"]:
|
| 74 |
+
image_id = int(image["id"])
|
| 75 |
+
image_file = os.path.join(image_root, image['file_name'])
|
| 76 |
+
grounding_anno = grd_dict[image_id]
|
| 77 |
+
for ann in grounding_anno:
|
| 78 |
+
if 'mask_file' not in ann:
|
| 79 |
+
ann['mask_file'] = image['file_name']
|
| 80 |
+
ann['mask_file'] = os.path.join(mask_root, ann['mask_file'])
|
| 81 |
+
ret.append(
|
| 82 |
+
{
|
| 83 |
+
"file_name": image_file,
|
| 84 |
+
"image_id": image_id,
|
| 85 |
+
"grounding_info": [ann],
|
| 86 |
+
}
|
| 87 |
+
)
|
| 88 |
+
assert len(ret), f"No images found in {image_root}!"
|
| 89 |
+
assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
|
| 90 |
+
return ret
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def register_biomed(
|
| 94 |
+
name, metadata, image_root, annot_json):
|
| 95 |
+
DatasetCatalog.register(
|
| 96 |
+
name,
|
| 97 |
+
lambda: load_biomed_json(image_root, annot_json, metadata),
|
| 98 |
+
)
|
| 99 |
+
MetadataCatalog.get(name).set(
|
| 100 |
+
image_root=image_root,
|
| 101 |
+
json_file=annot_json,
|
| 102 |
+
evaluator_type="grounding_refcoco",
|
| 103 |
+
ignore_label=255,
|
| 104 |
+
label_divisor=1000,
|
| 105 |
+
**metadata,
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def register_all_biomed(root):
|
| 110 |
+
for (
|
| 111 |
+
prefix,
|
| 112 |
+
(image_root, annot_root),
|
| 113 |
+
) in _PREDEFINED_SPLITS_BIOMED.items():
|
| 114 |
+
register_biomed(
|
| 115 |
+
prefix,
|
| 116 |
+
get_metadata(),
|
| 117 |
+
os.path.join(root, image_root),
|
| 118 |
+
os.path.join(root, annot_root),
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
_root = os.getenv("DATASET", "datasets")
|
| 123 |
+
register_all_biomed(_root)
|
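Note: once this registration module is imported, the split declared above can be fetched through detectron2's catalogs. A minimal sketch, assuming detectron2 is installed, the script runs from the repository root (so the local datasets package is picked up), and the demo data sits under the folder named by the DATASET environment variable:

import os
os.environ.setdefault("DATASET", "biomedparse_datasets")    # assumed data root; set before importing the module

from detectron2.data import DatasetCatalog, MetadataCatalog
from datasets.registration import register_biomed_datasets  # importing runs register_all_biomed at module load

name = "biomed_BiomedParseData-Demo_demo"                    # biomed_{dataset}_{split}, as composed above
records = DatasetCatalog.get(name)                           # calls load_biomed_json on {split}.json
meta = MetadataCatalog.get(name)
print(len(records), records[0]["file_name"], meta.evaluator_type)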
datasets/semseg_loader.py
ADDED
|
@@ -0,0 +1,10 @@
|
| 1 |
+
from PIL import Image
|
| 2 |
+
import scipy.io
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
def load_semseg(filename, loader_type):
|
| 6 |
+
if loader_type == 'PIL':
|
| 7 |
+
semseg = np.array(Image.open(filename), dtype=int)
|
| 8 |
+
elif loader_type == 'MAT':
|
| 9 |
+
semseg = scipy.io.loadmat(filename)['LabelMap']
|
| 10 |
+
return semseg
|
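Note: load_semseg just dispatches on the loader type string; a small usage sketch (paths are placeholders, and the import path is assumed from the repository layout):

from datasets.semseg_loader import load_semseg

png_labels = load_semseg('path/to/label.png', 'PIL')   # integer label map read from a PNG
mat_labels = load_semseg('path/to/label.mat', 'MAT')   # 'LabelMap' array read from a MATLAB .mat file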
datasets/utils/refcoco2json.py
ADDED
|
@@ -0,0 +1,41 @@
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from refer import REFER
|
| 4 |
+
|
| 5 |
+
coco_root = '/pth/to/coco'
|
| 6 |
+
ref_root = '/pth/to/refcocoseg'
|
| 7 |
+
|
| 8 |
+
coco_train_annot = json.load(open(os.path.join(coco_root, 'annotations/instances_train2017.json')))
|
| 9 |
+
coco_train_id = []
|
| 10 |
+
image_annot = {}
|
| 11 |
+
for i in range(len(coco_train_annot['images'])):
|
| 12 |
+
coco_train_id.append(coco_train_annot['images'][i]['id'])
|
| 13 |
+
image_annot[coco_train_annot['images'][i]['id']] = coco_train_annot['images'][i]
|
| 14 |
+
|
| 15 |
+
refg = REFER(data_root=ref_root,
|
| 16 |
+
dataset='refcocog', splitBy='umd')
|
| 17 |
+
refg_val_ids = refg.getRefIds(split='val')
|
| 18 |
+
|
| 19 |
+
full_anno = []
|
| 20 |
+
for ref_id in refg_val_ids:
|
| 21 |
+
ref = refg.loadRefs(ref_id)[0]
|
| 22 |
+
anno = refg.refToAnn[ref_id]
|
| 23 |
+
anno.update(ref)
|
| 24 |
+
full_anno.append(anno)
|
| 25 |
+
|
| 26 |
+
imageid_list = []
|
| 27 |
+
final_anno = {}
|
| 28 |
+
for anno in full_anno:
|
| 29 |
+
imageid_list += [anno['image_id']]
|
| 30 |
+
final_anno[anno['ann_id']] = anno
|
| 31 |
+
|
| 32 |
+
annotations = [value for key, value in final_anno.items()]
|
| 33 |
+
|
| 34 |
+
iamges = []
|
| 35 |
+
for image_id in list(set(imageid_list)):
|
| 36 |
+
iamges += [image_annot[image_id]]
|
| 37 |
+
|
| 38 |
+
outputs = {'images': iamges, 'annotations': annotations}
|
| 39 |
+
print(len(iamges))
|
| 40 |
+
print(len(annotations))
|
| 41 |
+
json.dump(outputs, open(os.path.join(coco_root, 'annotations/refcocog_umd_train.json'), 'w'))
|
datasets/utils/refer.py
ADDED
|
@@ -0,0 +1,372 @@
|
| 1 |
+
# This code is modified from https://github.com/lichengunc/refer, and with minor modification of python2/3 format
|
| 2 |
+
__author__ = 'licheng'
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
This interface provides access to four datasets:
|
| 6 |
+
1) refclef
|
| 7 |
+
2) refcoco
|
| 8 |
+
3) refcoco+
|
| 9 |
+
4) refcocog
|
| 10 |
+
split by unc and google
|
| 11 |
+
|
| 12 |
+
The following API functions are defined:
|
| 13 |
+
REFER - REFER api class
|
| 14 |
+
getRefIds - get ref ids that satisfy given filter conditions.
|
| 15 |
+
getAnnIds - get ann ids that satisfy given filter conditions.
|
| 16 |
+
getImgIds - get image ids that satisfy given filter conditions.
|
| 17 |
+
getCatIds - get category ids that satisfy given filter conditions.
|
| 18 |
+
loadRefs - load refs with the specified ref ids.
|
| 19 |
+
loadAnns - load anns with the specified ann ids.
|
| 20 |
+
loadImgs - load images with the specified image ids.
|
| 21 |
+
loadCats - load category names with the specified category ids.
|
| 22 |
+
getRefBox - get ref's bounding box [x, y, w, h] given the ref_id
|
| 23 |
+
showRef - show image, segmentation or box of the referred object with the ref
|
| 24 |
+
getMask - get mask and area of the referred object given ref
|
| 25 |
+
showMask - show mask of the referred object given ref
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from doctest import REPORT_ONLY_FIRST_FAILURE
|
| 29 |
+
import sys
|
| 30 |
+
import os.path as osp
|
| 31 |
+
import json
|
| 32 |
+
import pickle
|
| 33 |
+
import time
|
| 34 |
+
import itertools
|
| 35 |
+
import skimage.io as io
|
| 36 |
+
import matplotlib.pyplot as plt
|
| 37 |
+
from matplotlib.collections import PatchCollection
|
| 38 |
+
from matplotlib.patches import Polygon, Rectangle
|
| 39 |
+
from pprint import pprint
|
| 40 |
+
import numpy as np
|
| 41 |
+
from pycocotools import mask
|
| 42 |
+
# import cv2
|
| 43 |
+
# from skimage.measure import label, regionprops
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class REFER:
|
| 47 |
+
def __init__(self, data_root, dataset='refcoco', splitBy='unc'):
|
| 48 |
+
# provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
|
| 49 |
+
# also provide dataset name and splitBy information
|
| 50 |
+
# e.g., dataset = 'refcoco', splitBy = 'unc'
|
| 51 |
+
print('loading dataset {} into memory...'.format(dataset))
|
| 52 |
+
self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
|
| 53 |
+
self.DATA_DIR = osp.join(data_root, dataset)
|
| 54 |
+
if dataset in ['refcoco', 'refcoco+', 'refcocog']:
|
| 55 |
+
self.IMAGE_DIR = osp.join(data_root, 'images/mscoco/images/train2014')
|
| 56 |
+
elif dataset == 'refclef':
|
| 57 |
+
self.IMAGE_DIR = osp.join(data_root, 'images/saiapr_tc-12')
|
| 58 |
+
else:
|
| 59 |
+
print('No refer dataset is called [{}]'.format(dataset))
|
| 60 |
+
sys.exit()
|
| 61 |
+
|
| 62 |
+
# load refs from data/dataset/refs(dataset).json
|
| 63 |
+
tic = time.time()
|
| 64 |
+
ref_file = osp.join(self.DATA_DIR, 'refs('+splitBy+').p')
|
| 65 |
+
self.data = {}
|
| 66 |
+
self.data['dataset'] = dataset
|
| 67 |
+
self.data['refs'] = pickle.load(open(ref_file, 'rb'))
|
| 68 |
+
|
| 69 |
+
# load annotations from data/dataset/instances.json
|
| 70 |
+
instances_file = osp.join(self.DATA_DIR, 'instances.json')
|
| 71 |
+
instances = json.load(open(instances_file, 'r'))
|
| 72 |
+
self.data['images'] = instances['images']
|
| 73 |
+
self.data['annotations'] = instances['annotations']
|
| 74 |
+
self.data['categories'] = instances['categories']
|
| 75 |
+
|
| 76 |
+
# create index
|
| 77 |
+
self.createIndex()
|
| 78 |
+
print('DONE (t={:.2f}s)'.format(time.time()-tic))
|
| 79 |
+
|
| 80 |
+
def createIndex(self):
|
| 81 |
+
# create sets of mapping
|
| 82 |
+
# 1) Refs: {ref_id: ref}
|
| 83 |
+
# 2) Anns: {ann_id: ann}
|
| 84 |
+
# 3) Imgs: {image_id: image}
|
| 85 |
+
# 4) Cats: {category_id: category_name}
|
| 86 |
+
# 5) Sents: {sent_id: sent}
|
| 87 |
+
# 6) imgToRefs: {image_id: refs}
|
| 88 |
+
# 7) imgToAnns: {image_id: anns}
|
| 89 |
+
# 8) refToAnn: {ref_id: ann}
|
| 90 |
+
# 9) annToRef: {ann_id: ref}
|
| 91 |
+
# 10) catToRefs: {category_id: refs}
|
| 92 |
+
# 11) sentToRef: {sent_id: ref}
|
| 93 |
+
# 12) sentToTokens: {sent_id: tokens}
|
| 94 |
+
print('creating index...')
|
| 95 |
+
# fetch info from instances
|
| 96 |
+
Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
|
| 97 |
+
for ann in self.data['annotations']:
|
| 98 |
+
Anns[ann['id']] = ann
|
| 99 |
+
imgToAnns[ann['image_id']] = imgToAnns.get(
|
| 100 |
+
ann['image_id'], []) + [ann]
|
| 101 |
+
for img in self.data['images']:
|
| 102 |
+
Imgs[img['id']] = img
|
| 103 |
+
for cat in self.data['categories']:
|
| 104 |
+
Cats[cat['id']] = cat['name']
|
| 105 |
+
|
| 106 |
+
# fetch info from refs
|
| 107 |
+
Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
|
| 108 |
+
Sents, sentToRef, sentToTokens = {}, {}, {}
|
| 109 |
+
for ref in self.data['refs']:
|
| 110 |
+
# ids
|
| 111 |
+
ref_id = ref['ref_id']
|
| 112 |
+
ann_id = ref['ann_id']
|
| 113 |
+
category_id = ref['category_id']
|
| 114 |
+
image_id = ref['image_id']
|
| 115 |
+
|
| 116 |
+
# add mapping related to ref
|
| 117 |
+
Refs[ref_id] = ref
|
| 118 |
+
imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
|
| 119 |
+
catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
|
| 120 |
+
refToAnn[ref_id] = Anns[ann_id]
|
| 121 |
+
annToRef[ann_id] = ref
|
| 122 |
+
|
| 123 |
+
# add mapping of sent
|
| 124 |
+
for sent in ref['sentences']:
|
| 125 |
+
Sents[sent['sent_id']] = sent
|
| 126 |
+
sentToRef[sent['sent_id']] = ref
|
| 127 |
+
sentToTokens[sent['sent_id']] = sent['tokens']
|
| 128 |
+
|
| 129 |
+
# create class members
|
| 130 |
+
self.Refs = Refs
|
| 131 |
+
self.Anns = Anns
|
| 132 |
+
self.Imgs = Imgs
|
| 133 |
+
self.Cats = Cats
|
| 134 |
+
self.Sents = Sents
|
| 135 |
+
self.imgToRefs = imgToRefs
|
| 136 |
+
self.imgToAnns = imgToAnns
|
| 137 |
+
self.refToAnn = refToAnn
|
| 138 |
+
self.annToRef = annToRef
|
| 139 |
+
self.catToRefs = catToRefs
|
| 140 |
+
self.sentToRef = sentToRef
|
| 141 |
+
self.sentToTokens = sentToTokens
|
| 142 |
+
print('index created.')
|
| 143 |
+
|
| 144 |
+
def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
|
| 145 |
+
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
| 146 |
+
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
| 147 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 148 |
+
|
| 149 |
+
if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
|
| 150 |
+
refs = self.data['refs']
|
| 151 |
+
else:
|
| 152 |
+
if not len(image_ids) == 0:
|
| 153 |
+
refs = [self.imgToRefs[image_id] for image_id in image_ids]
|
| 154 |
+
else:
|
| 155 |
+
refs = self.data['refs']
|
| 156 |
+
if not len(cat_ids) == 0:
|
| 157 |
+
refs = [ref for ref in refs if ref['category_id'] in cat_ids]
|
| 158 |
+
if not len(ref_ids) == 0:
|
| 159 |
+
refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
|
| 160 |
+
if not len(split) == 0:
|
| 161 |
+
if split in ['testA', 'testB', 'testC']:
|
| 162 |
+
# we also consider testAB, testBC, ...
|
| 163 |
+
refs = [ref for ref in refs if split[-1] in ref['split']]
|
| 164 |
+
elif split in ['testAB', 'testBC', 'testAC']:
|
| 165 |
+
# rarely used I guess...
|
| 166 |
+
refs = [ref for ref in refs if ref['split'] == split]
|
| 167 |
+
elif split == 'test':
|
| 168 |
+
refs = [ref for ref in refs if 'test' in ref['split']]
|
| 169 |
+
elif split == 'train' or split == 'val':
|
| 170 |
+
refs = [ref for ref in refs if ref['split'] == split]
|
| 171 |
+
else:
|
| 172 |
+
print('No such split [{}]'.format(split))
|
| 173 |
+
sys.exit()
|
| 174 |
+
ref_ids = [ref['ref_id'] for ref in refs]
|
| 175 |
+
return ref_ids
|
| 176 |
+
|
| 177 |
+
def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
|
| 178 |
+
image_ids = image_ids if type(image_ids) == list else [image_ids]
|
| 179 |
+
cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
|
| 180 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 181 |
+
|
| 182 |
+
if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
|
| 183 |
+
ann_ids = [ann['id'] for ann in self.data['annotations']]
|
| 184 |
+
else:
|
| 185 |
+
if not len(image_ids) == 0:
|
| 186 |
+
lists = [self.imgToAnns[image_id]
|
| 187 |
+
for image_id in image_ids if image_id in self.imgToAnns] # list of [anns]
|
| 188 |
+
anns = list(itertools.chain.from_iterable(lists))
|
| 189 |
+
else:
|
| 190 |
+
anns = self.data['annotations']
|
| 191 |
+
if not len(cat_ids) == 0:
|
| 192 |
+
anns = [ann for ann in anns if ann['category_id'] in cat_ids]
|
| 193 |
+
ann_ids = [ann['id'] for ann in anns]
|
| 194 |
+
if not len(ref_ids) == 0:
|
| 195 |
+
ids = set(ann_ids).intersection(
|
| 196 |
+
set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
|
| 197 |
+
return ann_ids
|
| 198 |
+
|
| 199 |
+
def getImgIds(self, ref_ids=[]):
|
| 200 |
+
ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]
|
| 201 |
+
|
| 202 |
+
if not len(ref_ids) == 0:
|
| 203 |
+
image_ids = list(set([self.Refs[ref_id]['image_id']
|
| 204 |
+
for ref_id in ref_ids]))
|
| 205 |
+
else:
|
| 206 |
+
image_ids = self.Imgs.keys()
|
| 207 |
+
return image_ids
|
| 208 |
+
|
| 209 |
+
def getCatIds(self):
|
| 210 |
+
return self.Cats.keys()
|
| 211 |
+
|
| 212 |
+
def loadRefs(self, ref_ids=[]):
|
| 213 |
+
if type(ref_ids) == list:
|
| 214 |
+
return [self.Refs[ref_id] for ref_id in ref_ids]
|
| 215 |
+
elif type(ref_ids) == int:
|
| 216 |
+
return [self.Refs[ref_ids]]
|
| 217 |
+
|
| 218 |
+
def loadAnns(self, ann_ids=[]):
|
| 219 |
+
if type(ann_ids) == list:
|
| 220 |
+
return [self.Anns[ann_id] for ann_id in ann_ids]
|
| 221 |
+
elif type(ann_ids) == int or type(ann_ids) == str:
|
| 222 |
+
return [self.Anns[ann_ids]]
|
| 223 |
+
|
| 224 |
+
def loadImgs(self, image_ids=[]):
|
| 225 |
+
if type(image_ids) == list:
|
| 226 |
+
return [self.Imgs[image_id] for image_id in image_ids]
|
| 227 |
+
elif type(image_ids) == int:
|
| 228 |
+
return [self.Imgs[image_ids]]
|
| 229 |
+
|
| 230 |
+
def loadCats(self, cat_ids=[]):
|
| 231 |
+
if type(cat_ids) == list:
|
| 232 |
+
return [self.Cats[cat_id] for cat_id in cat_ids]
|
| 233 |
+
elif type(cat_ids) == int:
|
| 234 |
+
return [self.Cats[cat_ids]]
|
| 235 |
+
|
| 236 |
+
def getRefBox(self, ref_id):
|
| 237 |
+
ref = self.Refs[ref_id]
|
| 238 |
+
ann = self.refToAnn[ref_id]
|
| 239 |
+
return ann['bbox'] # [x, y, w, h]
|
| 240 |
+
|
| 241 |
+
def showRef(self, ref, seg_box='seg'):
|
| 242 |
+
ax = plt.gca()
|
| 243 |
+
# show image
|
| 244 |
+
image = self.Imgs[ref['image_id']]
|
| 245 |
+
I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
|
| 246 |
+
ax.imshow(I)
|
| 247 |
+
# show refer expression
|
| 248 |
+
for sid, sent in enumerate(ref['sentences']):
|
| 249 |
+
print('{}. {}'.format(sid+1, sent['sent']))
|
| 250 |
+
# show segmentations
|
| 251 |
+
if seg_box == 'seg':
|
| 252 |
+
ann_id = ref['ann_id']
|
| 253 |
+
ann = self.Anns[ann_id]
|
| 254 |
+
polygons = []
|
| 255 |
+
color = []
|
| 256 |
+
c = 'none'
|
| 257 |
+
if type(ann['segmentation'][0]) == list:
|
| 258 |
+
# polygon used for refcoco*
|
| 259 |
+
for seg in ann['segmentation']:
|
| 260 |
+
poly = np.array(seg).reshape((len(seg)//2, 2))
|
| 261 |
+
polygons.append(Polygon(poly, True, alpha=0.4))
|
| 262 |
+
color.append(c)
|
| 263 |
+
p = PatchCollection(polygons, facecolors=color, edgecolors=(
|
| 264 |
+
1, 1, 0, 0), linewidths=3, alpha=1)
|
| 265 |
+
ax.add_collection(p) # thick yellow polygon
|
| 266 |
+
p = PatchCollection(polygons, facecolors=color, edgecolors=(
|
| 267 |
+
1, 0, 0, 0), linewidths=1, alpha=1)
|
| 268 |
+
ax.add_collection(p) # thin red polygon
|
| 269 |
+
else:
|
| 270 |
+
# mask used for refclef
|
| 271 |
+
rle = ann['segmentation']
|
| 272 |
+
m = mask.decode(rle)
|
| 273 |
+
img = np.ones((m.shape[0], m.shape[1], 3))
|
| 274 |
+
color_mask = np.array([2.0, 166.0, 101.0])/255
|
| 275 |
+
for i in range(3):
|
| 276 |
+
img[:, :, i] = color_mask[i]
|
| 277 |
+
ax.imshow(np.dstack((img, m*0.5)))
|
| 278 |
+
# show bounding-box
|
| 279 |
+
elif seg_box == 'box':
|
| 280 |
+
ann_id = ref['ann_id']
|
| 281 |
+
ann = self.Anns[ann_id]
|
| 282 |
+
bbox = self.getRefBox(ref['ref_id'])
|
| 283 |
+
box_plot = Rectangle(
|
| 284 |
+
(bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
|
| 285 |
+
ax.add_patch(box_plot)
|
| 286 |
+
|
| 287 |
+
def getMask(self, ref):
|
| 288 |
+
# return mask, area and mask-center
|
| 289 |
+
ann = self.refToAnn[ref['ref_id']]
|
| 290 |
+
image = self.Imgs[ref['image_id']]
|
| 291 |
+
if type(ann['segmentation'][0]) == list: # polygon
|
| 292 |
+
rle = mask.frPyObjects(
|
| 293 |
+
ann['segmentation'], image['height'], image['width'])
|
| 294 |
+
else:
|
| 295 |
+
rle = ann['segmentation']
|
| 296 |
+
m = mask.decode(rle)
|
| 297 |
+
# sometimes there are multiple binary map (corresponding to multiple segs)
|
| 298 |
+
m = np.sum(m, axis=2)
|
| 299 |
+
m = m.astype(np.uint8) # convert to np.uint8
|
| 300 |
+
# compute area
|
| 301 |
+
area = sum(mask.area(rle)) # should be close to ann['area']
|
| 302 |
+
return {'mask': m, 'area': area}
|
| 303 |
+
# # position
|
| 304 |
+
# position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)
|
| 305 |
+
# position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)
|
| 306 |
+
# # mass position (if there were multiple regions, we use the largest one.)
|
| 307 |
+
# label_m = label(m, connectivity=m.ndim)
|
| 308 |
+
# regions = regionprops(label_m)
|
| 309 |
+
# if len(regions) > 0:
|
| 310 |
+
# largest_id = np.argmax(np.array([props.filled_area for props in regions]))
|
| 311 |
+
# largest_props = regions[largest_id]
|
| 312 |
+
# mass_y, mass_x = largest_props.centroid
|
| 313 |
+
# else:
|
| 314 |
+
# mass_x, mass_y = position_x, position_y
|
| 315 |
+
# # if centroid is not in mask, we find the closest point to it from mask
|
| 316 |
+
# if m[mass_y, mass_x] != 1:
|
| 317 |
+
# print 'Finding closes mask point ...'
|
| 318 |
+
# kernel = np.ones((10, 10),np.uint8)
|
| 319 |
+
# me = cv2.erode(m, kernel, iterations = 1)
|
| 320 |
+
# points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style
|
| 321 |
+
# points = np.array(points)
|
| 322 |
+
# dist = np.sum((points - (mass_y, mass_x))**2, axis=1)
|
| 323 |
+
# id = np.argsort(dist)[0]
|
| 324 |
+
# mass_y, mass_x = points[id]
|
| 325 |
+
# # return
|
| 326 |
+
# return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}
|
| 327 |
+
# # show image and mask
|
| 328 |
+
# I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
|
| 329 |
+
# plt.figure()
|
| 330 |
+
# plt.imshow(I)
|
| 331 |
+
# ax = plt.gca()
|
| 332 |
+
# img = np.ones( (m.shape[0], m.shape[1], 3) )
|
| 333 |
+
# color_mask = np.array([2.0,166.0,101.0])/255
|
| 334 |
+
# for i in range(3):
|
| 335 |
+
# img[:,:,i] = color_mask[i]
|
| 336 |
+
# ax.imshow(np.dstack( (img, m*0.5) ))
|
| 337 |
+
# plt.show()
|
| 338 |
+
|
| 339 |
+
def showMask(self, ref):
|
| 340 |
+
M = self.getMask(ref)
|
| 341 |
+
msk = M['mask']
|
| 342 |
+
ax = plt.gca()
|
| 343 |
+
ax.imshow(msk)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
if __name__ == '__main__':
|
| 347 |
+
refer = REFER(data_root='/home/xueyanz/code/dataset/refcocoseg',
|
| 348 |
+
dataset='refcocog', splitBy='google')
|
| 349 |
+
ref_ids = refer.getRefIds()
|
| 350 |
+
print(len(ref_ids))
|
| 351 |
+
|
| 352 |
+
print(len(refer.Imgs))
|
| 353 |
+
print(len(refer.imgToRefs))
|
| 354 |
+
|
| 355 |
+
ref_ids = refer.getRefIds(split='train')
|
| 356 |
+
print('There are {} training referred objects.'.format(len(ref_ids)))
|
| 357 |
+
|
| 358 |
+
for ref_id in ref_ids:
|
| 359 |
+
ref = refer.loadRefs(ref_id)[0]
|
| 360 |
+
if len(ref['sentences']) < 2:
|
| 361 |
+
continue
|
| 362 |
+
|
| 363 |
+
pprint(ref)
|
| 364 |
+
print('The label is {}.'.format(refer.Cats[ref['category_id']]))
|
| 365 |
+
|
| 366 |
+
# plt.figure()
|
| 367 |
+
# refer.showRef(ref, seg_box='box')
|
| 368 |
+
# plt.show()
|
| 369 |
+
|
| 370 |
+
# plt.figure()
|
| 371 |
+
# refer.showMask(ref)
|
| 372 |
+
# plt.show()
|
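Note: as a quick reference for the API listed in the docstring above, a typical REFER workflow could look like the sketch below (the data_root path is a placeholder and must contain refs(umd).p and instances.json for refcocog):

from datasets.utils.refer import REFER   # import path assumed from the repository layout

refer = REFER(data_root='/path/to/refcocoseg', dataset='refcocog', splitBy='umd')
ref_ids = refer.getRefIds(split='val')        # filter refs by split
ref = refer.loadRefs(ref_ids[0])[0]           # load a single ref dict
print(ref['sentences'][0]['sent'])            # its first referring expression
box = refer.getRefBox(ref['ref_id'])          # [x, y, w, h]
m = refer.getMask(ref)                        # {'mask': HxW uint8 array, 'area': ...}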
datasets/visual_sampler/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
| 1 |
+
from .sampler import ShapeSampler
|
| 2 |
+
from .simpleclick_sampler import SimpleClickSampler
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def build_shape_sampler(cfg, **kwargs):
|
| 6 |
+
sampler_name = cfg['STROKE_SAMPLER']['EVAL']['MODE']
|
| 7 |
+
if sampler_name == 'random':
|
| 8 |
+
return ShapeSampler(cfg, **kwargs)
|
| 9 |
+
elif sampler_name in ['best', 'best_random']:
|
| 10 |
+
return SimpleClickSampler(cfg, **kwargs)
|
| 11 |
+
else:
|
| 12 |
+
assert False, "not implemented"
|
datasets/visual_sampler/circle.py
ADDED
|
@@ -0,0 +1,106 @@
|
| 1 |
+
import random
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
from .mask_generators import get_mask_by_input_strokes
|
| 5 |
+
|
| 6 |
+
class Circle:
|
| 7 |
+
def __init__(self, cfg, is_train=True):
|
| 8 |
+
self.num_stroke = cfg['STROKE_SAMPLER']['CIRCLE']['NUM_STROKES']
|
| 9 |
+
self.stroke_preset = cfg['STROKE_SAMPLER']['CIRCLE']['STROKE_PRESET']
|
| 10 |
+
self.stroke_prob = cfg['STROKE_SAMPLER']['CIRCLE']['STROKE_PROB']
|
| 11 |
+
self.max_eval = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
|
| 12 |
+
self.is_train = is_train
|
| 13 |
+
|
| 14 |
+
@staticmethod
|
| 15 |
+
def get_stroke_preset(stroke_preset):
|
| 16 |
+
if stroke_preset == 'object_like':
|
| 17 |
+
return {
|
| 18 |
+
"nVertexBound": [5, 30],
|
| 19 |
+
"maxHeadSpeed": 15,
|
| 20 |
+
"maxHeadAcceleration": (10, 1.5),
|
| 21 |
+
"brushWidthBound": (20, 50),
|
| 22 |
+
"nMovePointRatio": 0.5,
|
| 23 |
+
"maxPiontMove": 10,
|
| 24 |
+
"maxLineAcceleration": (5, 0.5),
|
| 25 |
+
"boarderGap": None,
|
| 26 |
+
"maxInitSpeed": 10,
|
| 27 |
+
}
|
| 28 |
+
elif stroke_preset == 'object_like_middle':
|
| 29 |
+
return {
|
| 30 |
+
"nVertexBound": [5, 15],
|
| 31 |
+
"maxHeadSpeed": 8,
|
| 32 |
+
"maxHeadAcceleration": (4, 1.5),
|
| 33 |
+
"brushWidthBound": (20, 50),
|
| 34 |
+
"nMovePointRatio": 0.5,
|
| 35 |
+
"maxPiontMove": 5,
|
| 36 |
+
"maxLineAcceleration": (5, 0.5),
|
| 37 |
+
"boarderGap": None,
|
| 38 |
+
"maxInitSpeed": 10,
|
| 39 |
+
}
|
| 40 |
+
elif stroke_preset == 'object_like_small':
|
| 41 |
+
return {
|
| 42 |
+
"nVertexBound": [5, 20],
|
| 43 |
+
"maxHeadSpeed": 7,
|
| 44 |
+
"maxHeadAcceleration": (3.5, 1.5),
|
| 45 |
+
"brushWidthBound": (10, 30),
|
| 46 |
+
"nMovePointRatio": 0.5,
|
| 47 |
+
"maxPiontMove": 5,
|
| 48 |
+
"maxLineAcceleration": (3, 0.5),
|
| 49 |
+
"boarderGap": None,
|
| 50 |
+
"maxInitSpeed": 4,
|
| 51 |
+
}
|
| 52 |
+
else:
|
| 53 |
+
raise NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.')
|
| 54 |
+
|
| 55 |
+
def get_random_points_from_mask(self, mask, n=5):
|
| 56 |
+
h,w = mask.shape
|
| 57 |
+
view_mask = mask.reshape(h*w)
|
| 58 |
+
non_zero_idx = view_mask.nonzero()[:,0]
|
| 59 |
+
selected_idx = torch.randperm(len(non_zero_idx))[:n]
|
| 60 |
+
non_zero_idx = non_zero_idx[selected_idx]
|
| 61 |
+
y = (non_zero_idx // w)*1.0
|
| 62 |
+
x = (non_zero_idx % w)*1.0
|
| 63 |
+
return torch.cat((x[:,None], y[:,None]), dim=1).numpy()
|
| 64 |
+
|
| 65 |
+
def draw(self, mask=None, box=None):
|
| 66 |
+
if mask.sum() < 10: # if mask is nearly empty
|
| 67 |
+
return torch.zeros(mask.shape).bool()
|
| 68 |
+
if not self.is_train:
|
| 69 |
+
return self.draw_eval(mask=mask, box=box)
|
| 70 |
+
stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0] # select which kind of object to use
|
| 71 |
+
preset = Circle.get_stroke_preset(stroke_preset_name)
|
| 72 |
+
nStroke = min(random.randint(1, self.num_stroke), mask.sum().item())
|
| 73 |
+
h,w = mask.shape
|
| 74 |
+
points = self.get_random_points_from_mask(mask, n=nStroke)
|
| 75 |
+
rand_mask = get_mask_by_input_strokes(
|
| 76 |
+
init_points=points,
|
| 77 |
+
imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points)), **preset)
|
| 78 |
+
rand_mask = (~torch.from_numpy(rand_mask)) * mask
|
| 79 |
+
return rand_mask
|
| 80 |
+
|
| 81 |
+
def draw_eval(self, mask=None, box=None):
|
| 82 |
+
stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0] # select which kind of object to use
|
| 83 |
+
preset = Circle.get_stroke_preset(stroke_preset_name)
|
| 84 |
+
nStroke = min(self.max_eval, mask.sum().item())
|
| 85 |
+
h,w = mask.shape
|
| 86 |
+
points = self.get_random_points_from_mask(mask, n=nStroke)
|
| 87 |
+
rand_masks = []
|
| 88 |
+
for i in range(len(points)):
|
| 89 |
+
rand_mask = get_mask_by_input_strokes(
|
| 90 |
+
init_points=points[:i+1],
|
| 91 |
+
imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points[:i+1])), **preset)
|
| 92 |
+
rand_masks += [(~torch.from_numpy(rand_mask)) * mask]
|
| 93 |
+
return torch.stack(rand_masks)
|
| 94 |
+
|
| 95 |
+
@staticmethod
|
| 96 |
+
def draw_by_points(points, mask, h, w):
|
| 97 |
+
stroke_preset_name = random.choices(['object_like', 'object_like_middle', 'object_like_small'], weights=[0.33,0.33,0.33], k=1)[0] # select which kind of object to use
|
| 98 |
+
preset = Circle.get_stroke_preset(stroke_preset_name)
|
| 99 |
+
rand_mask = get_mask_by_input_strokes(
|
| 100 |
+
init_points=points,
|
| 101 |
+
imageWidth=w, imageHeight=h, nStroke=len(points), **preset)[None,]
|
| 102 |
+
rand_masks = (~torch.from_numpy(rand_mask)) * mask
|
| 103 |
+
return rand_masks
|
| 104 |
+
|
| 105 |
+
def __repr__(self,):
|
| 106 |
+
return 'circle'
|
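Note: a toy sketch of exercising the Circle sampler above (the config keys are exactly the ones Circle.__init__ reads; the values are illustrative rather than project defaults, and the import path is assumed from the repository layout):

import torch
from datasets.visual_sampler.circle import Circle

cfg = {'STROKE_SAMPLER': {
    'CIRCLE': {'NUM_STROKES': 5,
               'STROKE_PRESET': ['object_like', 'object_like_middle', 'object_like_small'],
               'STROKE_PROB': [0.33, 0.33, 0.34]},
    'EVAL': {'MAX_ITER': 20}}}
circle = Circle(cfg, is_train=True)

gt = torch.zeros(256, 256, dtype=torch.bool)
gt[64:192, 64:192] = True                     # toy ground-truth region
scribble = circle.draw(mask=gt)               # boolean mask of simulated strokes inside the region
print(scribble.shape, scribble.sum().item())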
datasets/visual_sampler/mask_generators.py
ADDED
|
@@ -0,0 +1,215 @@
|
| 1 |
+
import numpy as np
|
| 2 |
+
import random
|
| 3 |
+
from PIL import Image, ImageDraw
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_mask_by_input_strokes(
|
| 7 |
+
init_points, imageWidth=320, imageHeight=180, nStroke=5,
|
| 8 |
+
nVertexBound=[10, 30], maxHeadSpeed=15, maxHeadAcceleration=(15, 0.5),
|
| 9 |
+
brushWidthBound=(5, 20), boarderGap=None, nMovePointRatio=0.5, maxPiontMove=10,
|
| 10 |
+
maxLineAcceleration=5, maxInitSpeed=5
|
| 11 |
+
):
|
| 12 |
+
'''
|
| 13 |
+
Get video masks by random strokes which move randomly between each
|
| 14 |
+
frame, including the whole stroke and its control points
|
| 15 |
+
|
| 16 |
+
Parameters
|
| 17 |
+
----------
|
| 18 |
+
imageWidth: Image width
|
| 19 |
+
imageHeight: Image height
|
| 20 |
+
nStroke: Number of drawn lines
|
| 21 |
+
nVertexBound: Lower/upper bound of number of control points for each line
|
| 22 |
+
maxHeadSpeed: Max head speed when creating control points
|
| 23 |
+
maxHeadAcceleration: Max acceleration applying on the current head point (
|
| 24 |
+
a head point and its velocity decide the next point)
|
| 25 |
+
brushWidthBound (min, max): Bound of width for each stroke
|
| 26 |
+
boarderGap: The minimum gap between image boarder and drawed lines
|
| 27 |
+
nMovePointRatio: The ratio of control points to move for next frames
|
| 28 |
+
maxPiontMove: The magnitude of movement for control points for next frames
|
| 29 |
+
maxLineAcceleration: The magnitude of acceleration for the whole line
|
| 30 |
+
|
| 31 |
+
Examples
|
| 32 |
+
----------
|
| 33 |
+
object_like_setting = {
|
| 34 |
+
"nVertexBound": [5, 20],
|
| 35 |
+
"maxHeadSpeed": 15,
|
| 36 |
+
"maxHeadAcceleration": (15, 3.14),
|
| 37 |
+
"brushWidthBound": (30, 50),
|
| 38 |
+
"nMovePointRatio": 0.5,
|
| 39 |
+
"maxPiontMove": 10,
|
| 40 |
+
"maxLineAcceleration": (5, 0.5),
|
| 41 |
+
"boarderGap": 20,
|
| 42 |
+
"maxInitSpeed": 10,
|
| 43 |
+
}
|
| 44 |
+
rand_curve_setting = {
|
| 45 |
+
"nVertexBound": [10, 30],
|
| 46 |
+
"maxHeadSpeed": 20,
|
| 47 |
+
"maxHeadAcceleration": (15, 0.5),
|
| 48 |
+
"brushWidthBound": (3, 10),
|
| 49 |
+
"nMovePointRatio": 0.5,
|
| 50 |
+
"maxPiontMove": 3,
|
| 51 |
+
"maxLineAcceleration": (5, 0.5),
|
| 52 |
+
"boarderGap": 20,
|
| 53 |
+
"maxInitSpeed": 6
|
| 54 |
+
}
|
| 55 |
+
get_video_masks_by_moving_random_stroke(video_len=5, nStroke=3, **object_like_setting)
|
| 56 |
+
'''
|
| 57 |
+
# Initialize a set of control points to draw the first mask
|
| 58 |
+
mask = Image.new(mode='1', size=(imageWidth, imageHeight), color=1)
|
| 59 |
+
control_points_set = []
|
| 60 |
+
for i in range(nStroke):
|
| 61 |
+
brushWidth = np.random.randint(brushWidthBound[0], brushWidthBound[1])
|
| 62 |
+
Xs, Ys, velocity = get_random_stroke_control_points(
|
| 63 |
+
init_point=init_points[i],
|
| 64 |
+
imageWidth=imageWidth, imageHeight=imageHeight,
|
| 65 |
+
nVertexBound=nVertexBound, maxHeadSpeed=maxHeadSpeed,
|
| 66 |
+
maxHeadAcceleration=maxHeadAcceleration, boarderGap=boarderGap,
|
| 67 |
+
maxInitSpeed=maxInitSpeed
|
| 68 |
+
)
|
| 69 |
+
control_points_set.append((Xs, Ys, velocity, brushWidth))
|
| 70 |
+
draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=0)
|
| 71 |
+
|
| 72 |
+
# Generate the following masks by randomly move strokes and their control points
|
| 73 |
+
mask = Image.new(mode='1', size=(imageWidth, imageHeight), color=1)
|
| 74 |
+
for j in range(len(control_points_set)):
|
| 75 |
+
Xs, Ys, velocity, brushWidth = control_points_set[j]
|
| 76 |
+
new_Xs, new_Ys = random_move_control_points(
|
| 77 |
+
Xs, Ys, velocity, nMovePointRatio, maxPiontMove,
|
| 78 |
+
maxLineAcceleration, boarderGap
|
| 79 |
+
)
|
| 80 |
+
control_points_set[j] = (new_Xs, new_Ys, velocity, brushWidth)
|
| 81 |
+
for Xs, Ys, velocity, brushWidth in control_points_set:
|
| 82 |
+
draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=0)
|
| 83 |
+
|
| 84 |
+
return np.array(mask)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def random_accelerate(velocity, maxAcceleration, dist='uniform'):
|
| 88 |
+
speed, angle = velocity
|
| 89 |
+
d_speed, d_angle = maxAcceleration
|
| 90 |
+
|
| 91 |
+
if dist == 'uniform':
|
| 92 |
+
speed += np.random.uniform(-d_speed, d_speed)
|
| 93 |
+
angle += np.random.uniform(-d_angle, d_angle)
|
| 94 |
+
elif dist == 'guassian':
|
| 95 |
+
speed += np.random.normal(0, d_speed / 2)
|
| 96 |
+
angle += np.random.normal(0, d_angle / 2)
|
| 97 |
+
else:
|
| 98 |
+
raise NotImplementedError(f'Distribution type {dist} is not supported.')
|
| 99 |
+
|
| 100 |
+
return (speed, angle)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def random_move_control_points(Xs, Ys, lineVelocity, nMovePointRatio, maxPiontMove, maxLineAcceleration, boarderGap=15):
|
| 104 |
+
new_Xs = Xs.copy()
|
| 105 |
+
new_Ys = Ys.copy()
|
| 106 |
+
|
| 107 |
+
# move the whole line and accelerate
|
| 108 |
+
speed, angle = lineVelocity
|
| 109 |
+
new_Xs += int(speed * np.cos(angle))
|
| 110 |
+
new_Ys += int(speed * np.sin(angle))
|
| 111 |
+
lineVelocity = random_accelerate(lineVelocity, maxLineAcceleration, dist='guassian')
|
| 112 |
+
|
| 113 |
+
# choose points to move
|
| 114 |
+
chosen = np.arange(len(Xs))
|
| 115 |
+
np.random.shuffle(chosen)
|
| 116 |
+
chosen = chosen[:int(len(Xs) * nMovePointRatio)]
|
| 117 |
+
for i in chosen:
|
| 118 |
+
new_Xs[i] += np.random.randint(-maxPiontMove, maxPiontMove)
|
| 119 |
+
new_Ys[i] += np.random.randint(-maxPiontMove, maxPiontMove)
|
| 120 |
+
return new_Xs, new_Ys
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def get_random_stroke_control_points(
|
| 124 |
+
init_point,
|
| 125 |
+
imageWidth, imageHeight,
|
| 126 |
+
nVertexBound=(10, 30), maxHeadSpeed=10, maxHeadAcceleration=(5, 0.5), boarderGap=20,
|
| 127 |
+
maxInitSpeed=10
|
| 128 |
+
):
|
| 129 |
+
'''
|
| 130 |
+
Implementation the free-form training masks generating algorithm
|
| 131 |
+
proposed by JIAHUI YU et al. in "Free-Form Image Inpainting with Gated Convolution"
|
| 132 |
+
'''
|
| 133 |
+
startX = init_point[0]
|
| 134 |
+
startY = init_point[1]
|
| 135 |
+
|
| 136 |
+
Xs = [init_point[0]]
|
| 137 |
+
Ys = [init_point[1]]
|
| 138 |
+
|
| 139 |
+
numVertex = np.random.randint(nVertexBound[0], nVertexBound[1])
|
| 140 |
+
|
| 141 |
+
angle = np.random.uniform(0, 2 * np.pi)
|
| 142 |
+
speed = np.random.uniform(0, maxHeadSpeed)
|
| 143 |
+
|
| 144 |
+
for i in range(numVertex):
|
| 145 |
+
speed, angle = random_accelerate((speed, angle), maxHeadAcceleration)
|
| 146 |
+
speed = np.clip(speed, 0, maxHeadSpeed)
|
| 147 |
+
|
| 148 |
+
nextX = startX + speed * np.sin(angle)
|
| 149 |
+
nextY = startY + speed * np.cos(angle)
|
| 150 |
+
|
| 151 |
+
if boarderGap is not None:
|
| 152 |
+
nextX = np.clip(nextX, boarderGap, imageWidth - boarderGap)
|
| 153 |
+
nextY = np.clip(nextY, boarderGap, imageHeight - boarderGap)
|
| 154 |
+
|
| 155 |
+
startX, startY = nextX, nextY
|
| 156 |
+
Xs.append(nextX)
|
| 157 |
+
Ys.append(nextY)
|
| 158 |
+
|
| 159 |
+
velocity = get_random_velocity(maxInitSpeed, dist='guassian')
|
| 160 |
+
|
| 161 |
+
return np.array(Xs), np.array(Ys), velocity
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def get_random_velocity(max_speed, dist='uniform'):
|
| 165 |
+
if dist == 'uniform':
|
| 166 |
+
speed = np.random.uniform(0, max_speed)
|
| 167 |
+
elif dist == 'guassian':
|
| 168 |
+
speed = np.abs(np.random.normal(0, max_speed / 2))
|
| 169 |
+
else:
|
| 170 |
+
raise NotImplementedError(f'Distribution type {dist} is not supported.')
|
| 171 |
+
|
| 172 |
+
angle = np.random.uniform(0, 2 * np.pi)
|
| 173 |
+
return (speed, angle)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=255):
|
| 177 |
+
radius = brushWidth // 2 - 1
|
| 178 |
+
for i in range(1, len(Xs)):
|
| 179 |
+
draw = ImageDraw.Draw(mask)
|
| 180 |
+
startX, startY = Xs[i - 1], Ys[i - 1]
|
| 181 |
+
nextX, nextY = Xs[i], Ys[i]
|
| 182 |
+
draw.line((startX, startY) + (nextX, nextY), fill=fill, width=brushWidth)
|
| 183 |
+
for x, y in zip(Xs, Ys):
|
| 184 |
+
draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=fill)
|
| 185 |
+
return mask
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# modified from https://github.com/naoto0804/pytorch-inpainting-with-partial-conv/blob/master/generate_data.py
|
| 189 |
+
def get_random_walk_mask(imageWidth=320, imageHeight=180, length=None):
|
| 190 |
+
action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]]
|
| 191 |
+
canvas = np.zeros((imageHeight, imageWidth)).astype("i")
|
| 192 |
+
if length is None:
|
| 193 |
+
length = imageWidth * imageHeight
|
| 194 |
+
x = random.randint(0, imageHeight - 1)
|
| 195 |
+
y = random.randint(0, imageWidth - 1)
|
| 196 |
+
x_list = []
|
| 197 |
+
y_list = []
|
| 198 |
+
for i in range(length):
|
| 199 |
+
r = random.randint(0, len(action_list) - 1)
|
| 200 |
+
x = np.clip(x + action_list[r][0], a_min=0, a_max=imageHeight - 1)
|
| 201 |
+
y = np.clip(y + action_list[r][1], a_min=0, a_max=imageWidth - 1)
|
| 202 |
+
x_list.append(x)
|
| 203 |
+
y_list.append(y)
|
| 204 |
+
canvas[np.array(x_list), np.array(y_list)] = 1
|
| 205 |
+
return Image.fromarray(canvas * 255).convert('1')
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def get_masked_ratio(mask):
|
| 209 |
+
"""
|
| 210 |
+
Calculate the masked ratio.
|
| 211 |
+
mask: Expected a binary PIL image, where 0 and 1 represent
|
| 212 |
+
masked(invalid) and valid pixel values.
|
| 213 |
+
"""
|
| 214 |
+
hist = mask.histogram()
|
| 215 |
+
return hist[0] / np.prod(mask.size)
|
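Note: following the object_like_setting listed in the docstring of get_mask_by_input_strokes, a standalone call could look like this sketch (the import path is assumed from the repository layout):

import numpy as np
from datasets.visual_sampler.mask_generators import get_mask_by_input_strokes

# one (x, y) starting point per stroke on a 320x180 canvas
init_points = np.stack([np.random.randint(0, 320, size=3),
                        np.random.randint(0, 180, size=3)], axis=1).astype(float)
stroke_mask = get_mask_by_input_strokes(
    init_points, imageWidth=320, imageHeight=180, nStroke=3,
    nVertexBound=[5, 20], maxHeadSpeed=15, maxHeadAcceleration=(15, 3.14),
    brushWidthBound=(30, 50), nMovePointRatio=0.5, maxPiontMove=10,
    maxLineAcceleration=(5, 0.5), boarderGap=20, maxInitSpeed=10)
print(stroke_mask.shape, stroke_mask.dtype)   # (180, 320) bool array, False where strokes were drawn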
datasets/visual_sampler/point.py
ADDED
|
@@ -0,0 +1,74 @@
|
| 1 |
+
import random
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np
|
| 5 |
+
from scipy import ndimage
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Point:
|
| 9 |
+
def __init__(self, cfg, is_train=True):
|
| 10 |
+
self.max_points = cfg['STROKE_SAMPLER']['POINT']['NUM_POINTS']
|
| 11 |
+
self.max_eval = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
|
| 12 |
+
self.is_train = is_train
|
| 13 |
+
|
| 14 |
+
def draw(self, mask=None, box=None):
|
| 15 |
+
if mask.sum() < 10:
|
| 16 |
+
return torch.zeros(mask.shape).bool() # if mask is empty
|
| 17 |
+
if not self.is_train:
|
| 18 |
+
return self.draw_eval(mask=mask, box=box)
|
| 19 |
+
max_points = min(self.max_points, mask.sum().item()) # max number of points no more than total mask number
|
| 20 |
+
num_points = random.randint(1, max_points) # get a random number of points
|
| 21 |
+
h,w = mask.shape
|
| 22 |
+
view_mask = mask.view(-1)
|
| 23 |
+
non_zero_idx = view_mask.nonzero()[:,0] # get non-zero index of mask
|
| 24 |
+
selected_idx = torch.randperm(len(non_zero_idx))[:num_points] # select id
|
| 25 |
+
non_zero_idx = non_zero_idx[selected_idx] # select non-zero index
|
| 26 |
+
rand_mask = torch.zeros(view_mask.shape).bool() # init rand mask
|
| 27 |
+
rand_mask[non_zero_idx] = True # set the sampled non-zero locations to True
|
| 28 |
+
# dilate
|
| 29 |
+
# struct = ndimage.generate_binary_structure(2, 2)
|
| 30 |
+
# rand_mask = torch.from_numpy((ndimage.binary_dilation(rand_mask.reshape(h, w).numpy(), structure=struct, iterations=5).astype(rand_mask.numpy().dtype)))
|
| 31 |
+
# return rand_mask
|
| 32 |
+
return rand_mask.reshape(h, w)
|
| 33 |
+
|
| 34 |
+
def draw_eval(self, mask=None, box=None):
|
| 35 |
+
background = ~mask
|
| 36 |
+
neg_num = min(self.max_eval // 2, background.sum().item())
|
| 37 |
+
pos_num = min(self.max_eval - neg_num, mask.sum().item()-1) + 1
|
| 38 |
+
|
| 39 |
+
h,w = mask.shape
|
| 40 |
+
view_mask = mask.view(-1)
|
| 41 |
+
non_zero_idx_pos = view_mask.nonzero()[:,0] # get non-zero index of mask
|
| 42 |
+
selected_idx_pos = torch.randperm(len(non_zero_idx_pos))[:pos_num] # select id
|
| 43 |
+
non_zero_idx_pos = non_zero_idx_pos[selected_idx_pos] # select non-zero index
|
| 44 |
+
pos_idx = torch.ones(non_zero_idx_pos.shape)
|
| 45 |
+
|
| 46 |
+
view_background = background.view(-1)
|
| 47 |
+
non_zero_idx_neg = view_background.nonzero()[:,0] # get non-zero index of mask
|
| 48 |
+
selected_idx_neg = torch.randperm(len(non_zero_idx_neg))[:neg_num] # select id
|
| 49 |
+
non_zero_idx_neg = non_zero_idx_neg[selected_idx_neg] # select non-zero index
|
| 50 |
+
neg_idx = torch.ones(non_zero_idx_neg.shape) * -1
|
| 51 |
+
|
| 52 |
+
non_zero_idx = torch.cat([non_zero_idx_pos, non_zero_idx_neg])
|
| 53 |
+
idx = torch.cat([pos_idx, neg_idx])
|
| 54 |
+
rand_idx = torch.cat([torch.zeros(1), torch.randperm(len(non_zero_idx)-1) + 1]).long()
|
| 55 |
+
non_zero_idx = non_zero_idx[rand_idx]
|
| 56 |
+
idx = idx[rand_idx]
|
| 57 |
+
|
| 58 |
+
rand_masks = []
|
| 59 |
+
for i in range(0, len(non_zero_idx)):
|
| 60 |
+
rand_mask = torch.zeros(view_mask.shape) # init rand mask
|
| 61 |
+
rand_mask[non_zero_idx[0:i+1]] = idx[0:i+1] # get non zero place to zero
|
| 62 |
+
# struct = ndimage.generate_binary_structure(2, 2)
|
| 63 |
+
# rand_mask = torch.from_numpy((ndimage.binary_dilation(rand_mask.reshape(h, w).numpy(), structure=struct, iterations=5).astype(rand_mask.numpy().dtype)))
|
| 64 |
+
rand_masks += [rand_mask.reshape(h, w)]
|
| 65 |
+
|
| 66 |
+
# kernel_size = 3
|
| 67 |
+
rand_masks = torch.stack(rand_masks)
|
| 68 |
+
# rand_masks = F.conv2d(rand_masks[:,None], torch.ones(1,1,kernel_size,kernel_size), padding=kernel_size//2)[:,0]
|
| 69 |
+
# rand_masks[rand_masks>0] = 1
|
| 70 |
+
# rand_masks[rand_masks<0] = -1
|
| 71 |
+
return rand_masks
|
| 72 |
+
|
| 73 |
+
def __repr__(self,):
|
| 74 |
+
return 'point'
|
datasets/visual_sampler/polygon.py
ADDED
@@ -0,0 +1,137 @@
import random

import numpy as np
import torch
from scipy.special import binom
from scipy import ndimage
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg

bernstein = lambda n, k, t: binom(n, k) * t**k * (1. - t)**(n - k)

def bezier(points, num=200):
    N = len(points)
    t = np.linspace(0, 1, num=num)
    curve = np.zeros((num, 2))
    for i in range(N):
        curve += np.outer(bernstein(N - 1, i, t), points[i])
    return curve

class Segment():
    def __init__(self, p1, p2, angle1, angle2, **kw):
        self.p1 = p1; self.p2 = p2
        self.angle1 = angle1; self.angle2 = angle2
        self.numpoints = kw.get("numpoints", 100)
        r = kw.get("r", 0.3)
        d = np.sqrt(np.sum((self.p2 - self.p1)**2))
        self.r = r * d
        self.p = np.zeros((4, 2))
        self.p[0, :] = self.p1[:]
        self.p[3, :] = self.p2[:]
        self.calc_intermediate_points(self.r)

    def calc_intermediate_points(self, r):
        self.p[1, :] = self.p1 + np.array([self.r * np.cos(self.angle1),
                                           self.r * np.sin(self.angle1)])
        self.p[2, :] = self.p2 + np.array([self.r * np.cos(self.angle2 + np.pi),
                                           self.r * np.sin(self.angle2 + np.pi)])
        self.curve = bezier(self.p, self.numpoints)

def get_curve(points, **kw):
    segments = []
    for i in range(len(points) - 1):
        seg = Segment(points[i, :2], points[i+1, :2], points[i, 2], points[i+1, 2], **kw)
        segments.append(seg)
    curve = np.concatenate([s.curve for s in segments])
    return segments, curve

def ccw_sort(p):
    d = p - np.mean(p, axis=0)
    s = np.arctan2(d[:, 0], d[:, 1])
    return p[np.argsort(s), :]

def get_bezier_curve(a, rad=0.2, edgy=0):
    """ given an array of points *a*, create a curve through
    those points.
    *rad* is a number between 0 and 1 to steer the distance of
    control points.
    *edgy* is a parameter which controls how "edgy" the curve is,
    edgy=0 is smoothest."""
    p = np.arctan(edgy) / np.pi + .5
    a = ccw_sort(a)
    a = np.append(a, np.atleast_2d(a[0, :]), axis=0)
    d = np.diff(a, axis=0)
    ang = np.arctan2(d[:, 1], d[:, 0])
    f = lambda ang: (ang >= 0) * ang + (ang < 0) * (ang + 2 * np.pi)
    ang = f(ang)
    ang1 = ang
    ang2 = np.roll(ang, 1)
    ang = p * ang1 + (1 - p) * ang2 + (np.abs(ang2 - ang1) > np.pi) * np.pi
    ang = np.append(ang, [ang[0]])
    a = np.append(a, np.atleast_2d(ang).T, axis=1)
    s, c = get_curve(a, r=rad, method="var")
    x, y = c.T
    return x, y, a

class Polygon:
    def __init__(self, cfg, is_train):
        self.max_points = cfg['STROKE_SAMPLER']['POLYGON']['MAX_POINTS']
        self.eval_points = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
        self.is_train = is_train

    def get_random_points_from_mask(self, mask, n=3):
        h, w = mask.shape
        view_mask = mask.reshape(h * w)
        non_zero_idx = view_mask.nonzero()[:, 0]
        selected_idx = torch.randperm(len(non_zero_idx))[:n]
        non_zero_idx = non_zero_idx[selected_idx]
        y = (non_zero_idx // w) * 1.0 / (h + 1)
        x = (non_zero_idx % w) * 1.0 / (w + 1)
        return torch.cat((x[:, None], y[:, None]), dim=1).numpy()

    def draw(self, mask=None, box=None):
        if mask.sum() < 10:
            return torch.zeros(mask.shape).bool()  # if mask is empty
        if not self.is_train:
            return self.draw_eval(mask=mask, box=box)
        # box: x1,y1,x2,y2
        x1, y1, x2, y2 = box.int().unbind()
        rad = 0.2
        edgy = 0.05
        num_points = random.randint(1, min(self.max_points, mask.sum().item()))
        a = self.get_random_points_from_mask(mask[y1:y2, x1:x2], n=num_points)
        x, y, _ = get_bezier_curve(a, rad=rad, edgy=edgy)
        x = x.clip(0.0, 1.0)
        y = y.clip(0.0, 1.0)
        points = torch.from_numpy(np.concatenate((y[None,] * (y2 - y1 - 1).item(), x[None,] * (x2 - x1 - 1).item()))).int()
        canvas = torch.zeros((y2 - y1, x2 - x1))
        canvas[points.long().tolist()] = 1
        rand_mask = torch.zeros(mask.shape)
        rand_mask[y1:y2, x1:x2] = canvas
        return rand_mask.bool()

    def draw_eval(self, mask=None, box=None):
        # box: x1,y1,x2,y2
        x1, y1, x2, y2 = box.int().unbind()
        rad = 0.2
        edgy = 0.05
        num_points = min(self.eval_points, mask.sum().item())
        a = self.get_random_points_from_mask(mask[y1:y2, x1:x2], n=num_points)
        rand_masks = []
        for i in range(len(a)):
            x, y, _ = get_bezier_curve(a[:i+1], rad=rad, edgy=edgy)
            x = x.clip(0.0, 1.0)
            y = y.clip(0.0, 1.0)
            points = torch.from_numpy(np.concatenate((y[None,] * (y2 - y1 - 1).item(), x[None,] * (x2 - x1 - 1).item()))).int()
            canvas = torch.zeros((y2 - y1, x2 - x1))
            canvas[points.long().tolist()] = 1
            rand_mask = torch.zeros(mask.shape)
            rand_mask[y1:y2, x1:x2] = canvas

            struct = ndimage.generate_binary_structure(2, 2)
            rand_mask = torch.from_numpy((ndimage.binary_dilation(rand_mask, structure=struct, iterations=5).astype(rand_mask.numpy().dtype)))
            rand_masks += [rand_mask.bool()]
        return torch.stack(rand_masks)

    def __repr__(self):
        return 'polygon'
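An illustrative sketch of calling get_bezier_curve from the file above (the control points are arbitrary; it assumes the repo is importable):

import numpy as np
from datasets.visual_sampler.polygon import get_bezier_curve

a = np.array([[0.1, 0.2], [0.8, 0.3], [0.5, 0.9]])  # three (x, y) points in [0, 1]
x, y, _ = get_bezier_curve(a, rad=0.2, edgy=0.05)   # closed Bezier curve through the points
x, y = x.clip(0.0, 1.0), y.clip(0.0, 1.0)           # same clipping Polygon.draw applies
print(x.shape, y.shape)                             # 100 samples per curve segment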
datasets/visual_sampler/sampler.py
ADDED
@@ -0,0 +1,77 @@
import sys
import random

import torch
import torch.nn as nn

from .point import Point
from .polygon import Polygon
from .scribble import Scribble
from .circle import Circle

from modeling.utils import configurable


class ShapeSampler(nn.Module):
    @configurable
    def __init__(self, max_candidate=1, shape_prob=[], shape_candidate=[], is_train=True):
        super().__init__()
        self.max_candidate = max_candidate
        self.shape_prob = shape_prob
        self.shape_candidate = shape_candidate
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True, mode=None):
        max_candidate = cfg['STROKE_SAMPLER']['MAX_CANDIDATE']
        candidate_probs = cfg['STROKE_SAMPLER']['CANDIDATE_PROBS']
        candidate_names = cfg['STROKE_SAMPLER']['CANDIDATE_NAMES']

        if mode == 'hack_train':
            candidate_classes = [getattr(sys.modules[__name__], class_name)(cfg, True) for class_name in candidate_names]
        else:
            # overwrite candidate_probs
            if not is_train:
                candidate_probs = [0.0 for x in range(len(candidate_names))]
                candidate_probs[candidate_names.index(mode)] = 1.0
            candidate_classes = [getattr(sys.modules[__name__], class_name)(cfg, is_train) for class_name in candidate_names]

        # Build augmentation
        return {
            "max_candidate": max_candidate,
            "shape_prob": candidate_probs,
            "shape_candidate": candidate_classes,
            "is_train": is_train,
        }

    def forward(self, instances):
        masks = instances.gt_masks.tensor
        boxes = instances.gt_boxes.tensor

        if len(masks) == 0:
            gt_masks = torch.zeros(masks.shape[-2:]).bool()
            rand_masks = torch.zeros(masks.shape[-2:]).bool()
            return {'gt_masks': gt_masks[None,:], 'rand_shape': torch.stack([rand_masks]), 'types': ['none']}
        indices = [x for x in range(len(masks))]

        if self.is_train:
            random.shuffle(indices)
            candidate_mask = masks[indices[:self.max_candidate]]
            candidate_box = boxes[indices[:self.max_candidate]]
        else:
            candidate_mask = masks
            candidate_box = boxes

        draw_funcs = random.choices(self.shape_candidate, weights=self.shape_prob, k=len(candidate_mask))
        rand_shapes = [d.draw(x, y) for d, x, y in zip(draw_funcs, candidate_mask, candidate_box)]
        types = [repr(x) for x in draw_funcs]
        for i in range(0, len(rand_shapes)):
            if rand_shapes[i].sum() == 0:
                candidate_mask[i] = candidate_mask[i] * 0
                types[i] = 'none'

        # candidate_mask: (c,h,w), bool. rand_shape: (c, iter, h, w), bool. types: list(c)
        return {'gt_masks': candidate_mask, 'rand_shape': torch.stack(rand_shapes).bool(), 'types': types, 'sampler': self}

def build_shape_sampler(cfg, **kwargs):
    return ShapeSampler(cfg, **kwargs)
datasets/visual_sampler/scribble.py
ADDED
@@ -0,0 +1,96 @@
import random

import torch

from .mask_generators import get_mask_by_input_strokes

class Scribble:
    def __init__(self, cfg, is_train):
        self.num_stroke = cfg['STROKE_SAMPLER']['SCRIBBLE']['NUM_STROKES']
        self.stroke_preset = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PRESET']
        self.stroke_prob = cfg['STROKE_SAMPLER']['SCRIBBLE']['STROKE_PROB']
        self.eval_stroke = cfg['STROKE_SAMPLER']['EVAL']['MAX_ITER']
        self.is_train = is_train

    @staticmethod
    def get_stroke_preset(stroke_preset):
        if stroke_preset == 'rand_curve':
            return {
                "nVertexBound": [10, 30],
                "maxHeadSpeed": 20,
                "maxHeadAcceleration": (15, 0.5),
                "brushWidthBound": (3, 10),
                "nMovePointRatio": 0.5,
                "maxPiontMove": 3,
                "maxLineAcceleration": (5, 0.5),
                "boarderGap": None,
                "maxInitSpeed": 6
            }
        elif stroke_preset == 'rand_curve_small':
            return {
                "nVertexBound": [6, 22],
                "maxHeadSpeed": 12,
                "maxHeadAcceleration": (8, 0.5),
                "brushWidthBound": (2.5, 5),
                "nMovePointRatio": 0.5,
                "maxPiontMove": 1.5,
                "maxLineAcceleration": (3, 0.5),
                "boarderGap": None,
                "maxInitSpeed": 3
            }
        else:
            raise NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.')

    def get_random_points_from_mask(self, mask, n=5):
        h, w = mask.shape
        view_mask = mask.reshape(h * w)
        non_zero_idx = view_mask.nonzero()[:, 0]
        selected_idx = torch.randperm(len(non_zero_idx))[:n]
        non_zero_idx = non_zero_idx[selected_idx]
        y = (non_zero_idx // w) * 1.0
        x = (non_zero_idx % w) * 1.0
        return torch.cat((x[:, None], y[:, None]), dim=1).numpy()

    def draw(self, mask=None, box=None):
        if mask.sum() < 10:
            return torch.zeros(mask.shape).bool()  # if mask is empty
        if not self.is_train:
            return self.draw_eval(mask=mask, box=box)
        stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
        preset = Scribble.get_stroke_preset(stroke_preset_name)
        nStroke = random.randint(1, min(self.num_stroke, mask.sum().item()))
        h, w = mask.shape
        points = self.get_random_points_from_mask(mask, n=nStroke)
        rand_mask = get_mask_by_input_strokes(
            init_points=points,
            imageWidth=w, imageHeight=h, nStroke=min(nStroke, len(points)), **preset)
        rand_mask = (~torch.from_numpy(rand_mask)) * mask
        return rand_mask

    def draw_eval(self, mask=None, box=None):
        stroke_preset_name = random.choices(self.stroke_preset, weights=self.stroke_prob, k=1)[0]
        preset = Scribble.get_stroke_preset(stroke_preset_name)
        nStroke = min(self.eval_stroke, mask.sum().item())
        h, w = mask.shape
        points = self.get_random_points_from_mask(mask, n=nStroke)
        rand_masks = []
        for i in range(len(points)):
            rand_mask = get_mask_by_input_strokes(
                init_points=points[:i+1],
                imageWidth=w, imageHeight=h, nStroke=min(i, len(points)), **preset)
            rand_mask = (~torch.from_numpy(rand_mask)) * mask
            rand_masks += [rand_mask]
        return torch.stack(rand_masks)

    @staticmethod
    def draw_by_points(points, mask, h, w):
        stroke_preset_name = random.choices(['rand_curve', 'rand_curve_small'], weights=[0.5, 0.5], k=1)[0]
        preset = Scribble.get_stroke_preset(stroke_preset_name)
        rand_mask = get_mask_by_input_strokes(
            init_points=points,
            imageWidth=w, imageHeight=h, nStroke=len(points), **preset)[None,]
        rand_masks = (~torch.from_numpy(rand_mask)) * mask
        return rand_masks

    def __repr__(self):
        return 'scribble'
datasets/visual_sampler/simpleclick_sampler.py
ADDED
@@ -0,0 +1,252 @@
import sys
import random

import cv2
import numpy as np
from scipy import ndimage
import torch
import torch.nn as nn
import torch.nn.functional as F
from kornia.contrib import distance_transform

from .point import Point
from .polygon import Polygon, get_bezier_curve
from .scribble import Scribble
from .circle import Circle

from modeling.utils import configurable


class SimpleClickSampler(nn.Module):
    @configurable
    def __init__(self, mask_mode='point', sample_negtive=False, is_train=True, dilation=None, dilation_kernel=None, max_points=None):
        super().__init__()
        self.mask_mode = mask_mode
        self.sample_negtive = sample_negtive
        self.is_train = is_train
        self.dilation = dilation
        self.register_buffer("dilation_kernel", dilation_kernel)
        self.max_points = max_points

    @classmethod
    def from_config(cls, cfg, is_train=True, mode=None):
        mask_mode = mode
        sample_negtive = cfg['STROKE_SAMPLER']['EVAL']['NEGATIVE']

        dilation = cfg['STROKE_SAMPLER']['DILATION']
        dilation_kernel = torch.ones((1, 1, dilation, dilation), device=torch.cuda.current_device())

        max_points = cfg['STROKE_SAMPLER']['POLYGON']['MAX_POINTS']

        # Build augmentation
        return {
            "mask_mode": mask_mode,
            "sample_negtive": sample_negtive,
            "is_train": is_train,
            "dilation": dilation,
            "dilation_kernel": dilation_kernel,
            "max_points": max_points,
        }

    def forward_point(self, instances, pred_masks=None, prev_masks=None):
        gt_masks = instances.gt_masks.tensor
        n, h, w = gt_masks.shape

        # We only consider positive points
        pred_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if pred_masks is None else pred_masks[:, :h, :w]
        prev_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if prev_masks is None else prev_masks

        if not gt_masks.is_cuda:
            gt_masks = gt_masks.to(pred_masks.device)

        fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)

        # conv implementation
        mask_dt = (distance_transform((~F.pad(fp[None,], pad=(1, 1, 1, 1), mode='constant', value=0)).float())[0, :, 1:-1, 1:-1]).reshape(n, -1)
        max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
        next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
        next_mask = next_mask.view(n, -1)

        next_mask[max_xy_idx] = True
        next_mask = next_mask.reshape((n, h, w)).float()
        next_mask = F.conv2d(next_mask[None,], self.dilation_kernel.repeat(len(next_mask), 1, 1, 1), padding=self.dilation//2, groups=len(next_mask))[0] > 0
        # end conv implementation

        # disk implementation
        # mask_dt = distance_transform((~fp)[None,].float())[0].view(n,-1)
        # max_xy = mask_dt.max(dim=-1)[1]
        # max_y, max_x = max_xy//w, max_xy%w
        # max_xy_idx = torch.stack([max_y, max_x]).transpose(0,1)[:,:,None,None]
        # y_idx = torch.arange(start=0, end=h, step=1, dtype=torch.float32, device=torch.cuda.current_device())
        # x_idx = torch.arange(start=0, end=w, step=1, dtype=torch.float32, device=torch.cuda.current_device())
        # coord_y, coord_x = torch.meshgrid(y_idx, x_idx)
        # coords = torch.stack((coord_y, coord_x), dim=0).unsqueeze(0).repeat(len(max_xy_idx),1,1,1) # [bsx2,2,h,w], corresponding to 2d coordinate
        # coords.add_(-max_xy_idx)
        # coords.mul_(coords)
        # next_mask = coords[:, 0] + coords[:, 1]
        # next_mask = (next_mask <= 5**2)
        # end disk implementation

        rand_shapes = prev_masks | next_mask

        types = ['point' for i in range(len(gt_masks))]
        return {'gt_masks': instances.gt_masks.tensor, 'rand_shape': rand_shapes[:, None], 'types': types}

    def forward_circle(self, instances, pred_masks=None, prev_masks=None):
        gt_masks = instances.gt_masks.tensor
        n, h, w = gt_masks.shape

        # We only consider positive points
        pred_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if pred_masks is None else pred_masks[:, :h, :w]
        prev_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if prev_masks is None else prev_masks

        if not gt_masks.is_cuda:
            gt_masks = gt_masks.to(pred_masks.device)

        fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)

        # conv implementation
        mask_dt = (distance_transform((~F.pad(fp[None,], pad=(1, 1, 1, 1), mode='constant', value=0)).float())[0, :, 1:-1, 1:-1]).reshape(n, -1)
        max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
        next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
        next_mask = next_mask.view(n, -1)

        next_mask[max_xy_idx] = True
        next_mask = next_mask.reshape((n, h, w)).float()

        _next_mask = []
        for idx in range(len(next_mask)):
            points = next_mask[idx].nonzero().flip(dims=[-1]).cpu().numpy()
            _next_mask += [Circle.draw_by_points(points, gt_masks[idx:idx+1].cpu(), h, w)]
        next_mask = torch.cat(_next_mask, dim=0).bool().cuda()
        rand_shapes = prev_masks | next_mask

        types = ['circle' for i in range(len(gt_masks))]
        return {'gt_masks': instances.gt_masks.tensor, 'rand_shape': rand_shapes[:, None], 'types': types}

    def forward_scribble(self, instances, pred_masks=None, prev_masks=None):
        gt_masks = instances.gt_masks.tensor
        n, h, w = gt_masks.shape

        # We only consider positive points
        pred_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if pred_masks is None else pred_masks[:, :h, :w]
        prev_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if prev_masks is None else prev_masks

        if not gt_masks.is_cuda:
            gt_masks = gt_masks.to(pred_masks.device)

        fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)

        # conv implementation
        mask_dt = (distance_transform((~F.pad(fp[None,], pad=(1, 1, 1, 1), mode='constant', value=0)).float())[0, :, 1:-1, 1:-1]).reshape(n, -1)
        max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
        next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
        next_mask = next_mask.view(n, -1)

        next_mask[max_xy_idx] = True
        next_mask = next_mask.reshape((n, h, w)).float()

        _next_mask = []
        for idx in range(len(next_mask)):
            points = next_mask[idx].nonzero().flip(dims=[-1]).cpu().numpy()
            _next_mask += [Scribble.draw_by_points(points, gt_masks[idx:idx+1].cpu(), h, w)]
        next_mask = torch.cat(_next_mask, dim=0).bool().cuda()
        rand_shapes = prev_masks | next_mask

        types = ['scribble' for i in range(len(gt_masks))]
        return {'gt_masks': instances.gt_masks.tensor, 'rand_shape': rand_shapes[:, None], 'types': types}

    def forward_polygon(self, instances, pred_masks=None, prev_masks=None):
        gt_masks = instances.gt_masks.tensor
        gt_boxes = instances.gt_boxes.tensor
        n, h, w = gt_masks.shape

        # We only consider positive points
        pred_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if pred_masks is None else pred_masks[:, :h, :w]
        prev_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if prev_masks is None else prev_masks

        if not gt_masks.is_cuda:
            gt_masks = gt_masks.to(pred_masks.device)

        fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)

        next_mask = []
        for i in range(len(fp)):
            rad = 0.2
            edgy = 0.05
            num_points = random.randint(1, min(self.max_points, fp[i].sum()))

            h, w = fp[i].shape
            view_mask = fp[i].reshape(h * w)
            non_zero_idx = view_mask.nonzero()[:, 0]
            selected_idx = torch.randperm(len(non_zero_idx))[:num_points]
            non_zero_idx = non_zero_idx[selected_idx]
            y = (non_zero_idx // w) * 1.0 / (h + 1)
            x = (non_zero_idx % w) * 1.0 / (w + 1)
            coords = torch.cat((x[:, None], y[:, None]), dim=1).cpu().numpy()

            x1, y1, x2, y2 = gt_boxes[i].int().unbind()
            x, y, _ = get_bezier_curve(coords, rad=rad, edgy=edgy)
            x = x.clip(0.0, 1.0)
            y = y.clip(0.0, 1.0)
            points = torch.from_numpy(np.concatenate((y[None,] * (y2 - y1 - 1).item(), x[None,] * (x2 - x1 - 1).item()))).int()
            canvas = torch.zeros((y2 - y1, x2 - x1))
            canvas[points.long().tolist()] = 1
            rand_mask = torch.zeros(fp[i].shape)
            rand_mask[y1:y2, x1:x2] = canvas
            next_mask += [rand_mask]

        next_mask = torch.stack(next_mask).to(pred_masks.device).bool()
        rand_shapes = prev_masks | next_mask

        types = ['polygon' for i in range(len(gt_masks))]
        return {'gt_masks': instances.gt_masks.tensor, 'rand_shape': rand_shapes[:, None], 'types': types}

    def forward_box(self, instances, pred_masks=None, prev_masks=None):
        gt_masks = instances.gt_masks.tensor
        gt_boxes = instances.gt_boxes.tensor
        n, h, w = gt_masks.shape

        for i in range(len(gt_masks)):
            x1, y1, x2, y2 = gt_boxes[i].int().unbind()
            gt_masks[i, y1:y2, x1:x2] = 1

        # We only consider positive points
        pred_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if pred_masks is None else pred_masks[:, :h, :w]
        prev_masks = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool() if prev_masks is None else prev_masks

        if not gt_masks.is_cuda:
            gt_masks = gt_masks.to(pred_masks.device)

        fp = gt_masks & (~(gt_masks & pred_masks)) & (~prev_masks)

        # conv implementation
        mask_dt = (distance_transform((~F.pad(fp[None,], pad=(1, 1, 1, 1), mode='constant', value=0)).float())[0, :, 1:-1, 1:-1]).reshape(n, -1)
        max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
        next_mask = torch.zeros(gt_masks.shape, device=torch.cuda.current_device()).bool()
        next_mask = next_mask.view(n, -1)

        next_mask[max_xy_idx] = True
        next_mask = next_mask.reshape((n, h, w)).float()
        next_mask = F.conv2d(next_mask[None,], self.dilation_kernel.repeat(len(next_mask), 1, 1, 1), padding=self.dilation//2, groups=len(next_mask))[0] > 0
        # end conv implementation

        rand_shapes = prev_masks | next_mask

        types = ['box' for i in range(len(gt_masks))]
        return {'gt_masks': instances.gt_masks.tensor, 'rand_shape': rand_shapes[:, None], 'types': types}

    def forward(self, instances, *args, **kwargs):
        if self.mask_mode == 'Point':
            return self.forward_point(instances, *args, **kwargs)
        elif self.mask_mode == 'Circle':
            return self.forward_circle(instances, *args, **kwargs)
        elif self.mask_mode == 'Scribble':
            return self.forward_scribble(instances, *args, **kwargs)
        elif self.mask_mode == 'Polygon':
            return self.forward_polygon(instances, *args, **kwargs)
        elif self.mask_mode == 'Box':
            return self.forward_box(instances, *args, **kwargs)

def build_shape_sampler(cfg, **kwargs):
    return ShapeSampler(cfg, **kwargs)
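For reference, the configuration keys read by the samplers above can be collected into a single STROKE_SAMPLER block. The sketch below is hypothetical: the key names come from the code in this diff, but the values are illustrative and are not the project's actual defaults (the Circle sampler's own keys are not shown in this diff and are omitted).

stroke_sampler_cfg = {
    'STROKE_SAMPLER': {
        'MAX_CANDIDATE': 10,                      # ShapeSampler: max masks sampled per image (illustrative)
        'CANDIDATE_NAMES': ['Point', 'Polygon', 'Scribble', 'Circle'],
        'CANDIDATE_PROBS': [0.25, 0.25, 0.25, 0.25],
        'POINT': {'NUM_POINTS': 20},              # Point: max positive clicks per mask (illustrative)
        'POLYGON': {'MAX_POINTS': 8},             # Polygon / SimpleClickSampler (illustrative)
        'SCRIBBLE': {
            'NUM_STROKES': 5,
            'STROKE_PRESET': ['rand_curve', 'rand_curve_small'],
            'STROKE_PROB': [0.5, 0.5],
        },
        'EVAL': {'MAX_ITER': 20, 'NEGATIVE': False},
        'DILATION': 3,                            # SimpleClickSampler: click dilation kernel size (illustrative)
    }
}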
docker/Dockerfile
ADDED
@@ -0,0 +1,32 @@
# FROM naotous/flash_attn:2.0.5-pytorch23.07
FROM wangkenpu/pytorch:1.8.0-py39-cuda11.1-cudnn8-ubuntu18.04

# RUN touch tensorboard_patcher.py && cp tensorboard_patcher.py $$USERSITE/usercustomize.py


# RUN pip install --upgrade pip

# RUN pip install -I torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# RUN pip install -I torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --user
# RUN pip install kornia
# RUN pip install timm==0.4.12
# RUN python -m pip install 'git+https://github.com/MaureenZOU/detectron2-xyz.git'
RUN pip install git+https://github.com/cocodataset/panopticapi.git
RUN pip install git+https://github.com/openai/CLIP.git

# RUN wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth

COPY assets/requirements/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt

COPY assets/requirements/requirements_custom.txt /tmp/requirements_custom.txt
RUN pip install -r /tmp/requirements_custom.txt

#RUN pip install -U protobuf

# Set environment variables
ENV MKL_THREADING_LAYER=GNU
ENV NCCL_DEBUG=INFO

# Set the working directory HERE!
WORKDIR /path/to/BiomedParse
docker/README.md
ADDED
@@ -0,0 +1,9 @@
In the Dockerfile, set WORKDIR to the path of your BiomedParse repo.

From the project root dir, run:

bash docker/docker_build.sh

Then start the container with bash docker/docker_run.sh

Inside the docker container, run setup_inside_docker.sh
docker/data_env.sh
ADDED
@@ -0,0 +1 @@
export HANOVER_DATASETS=biomedparse_datasets/ # Path to the datasets
docker/docker_build.sh
ADDED
@@ -0,0 +1 @@
docker build -f docker/Dockerfile -t seem .
docker/docker_run.sh
ADDED
@@ -0,0 +1 @@
docker run -it --gpus all --shm-size=128G -v /mnt:/mnt -v $(pwd):/workspace -w /workspace seem
docker/setup_inside_docker.sh
ADDED
@@ -0,0 +1,10 @@
# Custom operator [only needed when training the deformable vision encoder]
cd modeling/vision/encoder/ops && sh make.sh && cd ../../../../

# System package [only needed for the demo in SEEM]
sudo apt update
sudo apt install ffmpeg

#pip install gradio==3.44.4
#pip install openai-whisper
#pip install protobuf==3.20.*
entry.py
ADDED
@@ -0,0 +1,92 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Modified by Xueyan Zou ([email protected])
# --------------------------------------------------------

import os
import sys
import torch
import logging
#import wandb
import random
import numpy as np

from utilities.arguments import load_opt_command

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# def init_wandb(args, job_dir, entity='YOUR_USER_NAME', project='YOUR_PROJECT_NAME', job_name='tmp'):
#     wandb_dir = os.path.join(job_dir, 'wandb')
#     os.makedirs(wandb_dir, exist_ok=True)
#     runid = None
#     if os.path.exists(f"{wandb_dir}/runid.txt"):
#         runid = open(f"{wandb_dir}/runid.txt").read()

#     wandb.init(project=project,
#                name=job_name,
#                dir=wandb_dir,
#                entity=entity,
#                resume="allow",
#                id=runid,
#                config={"hierarchical": True},)

#     open(f"{wandb_dir}/runid.txt", 'w').write(wandb.run.id)
#     wandb.config.update({k: args[k] for k in args if k not in wandb.config})

def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

def main(args=None):
    '''
    [Main function for the entry point]
    1. Set environment variables for distributed training.
    2. Load the config file and set up the trainer.
    '''

    opt, cmdline_args = load_opt_command(args)
    command = cmdline_args.command

    if cmdline_args.user_dir:
        absolute_user_dir = os.path.abspath(cmdline_args.user_dir)
        opt['base_path'] = absolute_user_dir

    # update_opt(opt, command)
    world_size = 1
    if 'OMPI_COMM_WORLD_SIZE' in os.environ:
        world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])

    if opt['TRAINER'] == 'xdecoder':
        from trainer import XDecoder_Trainer as Trainer
    else:
        assert False, "The trainer type: {} is not defined!".format(opt['TRAINER'])

    set_seed(opt['RANDOM_SEED'])

    trainer = Trainer(opt)
    os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'DETAIL'

    if command == "train":
        # if opt['rank'] == 0 and opt['WANDB']:
        #     wandb.login(key=os.environ['WANDB_KEY'])
        #     init_wandb(opt, trainer.save_folder, job_name=trainer.save_folder)
        trainer.train()
    elif command == "evaluate":
        trainer.eval()
    else:
        raise ValueError(f"Unknown command: {command}")

if __name__ == "__main__":
    main()
    sys.exit(0)
environment.yml
ADDED
@@ -0,0 +1,149 @@
name: biomedparse
channels:
  - pytorch
  - nvidia
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - blas=1.0=mkl
  - brotli-python=1.0.9=py39h6a678d5_8
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2024.7.2=h06a4308_0
  - certifi=2024.7.4=py39h06a4308_0
  - charset-normalizer=3.3.2=pyhd3eb1b0_0
  - cuda-cudart=12.4.127=0
  - cuda-cupti=12.4.127=0
  - cuda-libraries=12.4.0=0
  - cuda-nvrtc=12.4.127=0
  - cuda-nvtx=12.4.127=0
  - cuda-opencl=12.6.37=0
  - cuda-runtime=12.4.0=0
  - cuda-version=12.6=3
  - ffmpeg=4.3=hf484d3e_0
  - filelock=3.13.1=py39h06a4308_0
  - freetype=2.12.1=h4a9f257_0
  - gmp=6.2.1=h295c915_3
  - gmpy2=2.1.2=py39heeb90bb_0
  - gnutls=3.6.15=he1e5248_0
  - idna=3.7=py39h06a4308_0
  - intel-openmp=2023.1.0=hdb19cb5_46306
  - jinja2=3.1.4=py39h06a4308_0
  - jpeg=9e=h5eee18b_3
  - lame=3.100=h7b6447c_0
  - lcms2=2.12=h3be6417_0
  - ld_impl_linux-64=2.38=h1181459_1
  - lerc=3.0=h295c915_0
  - libcublas=12.4.2.65=0
  - libcufft=11.2.0.44=0
  - libcufile=1.11.0.15=0
  - libcurand=10.3.7.37=0
  - libcusolver=11.6.0.99=0
  - libcusparse=12.3.0.142=0
  - libdeflate=1.17=h5eee18b_1
  - libffi=3.4.4=h6a678d5_1
  - libgcc-ng=11.2.0=h1234567_1
  - libgomp=11.2.0=h1234567_1
  - libiconv=1.16=h5eee18b_3
  - libidn2=2.3.4=h5eee18b_0
  - libjpeg-turbo=2.0.0=h9bf148f_0
  - libnpp=12.2.5.2=0
  - libnvfatbin=12.6.20=0
  - libnvjitlink=12.4.99=0
  - libnvjpeg=12.3.1.89=0
  - libpng=1.6.39=h5eee18b_0
  - libstdcxx-ng=11.2.0=h1234567_1
  - libtasn1=4.19.0=h5eee18b_0
  - libtiff=4.5.1=h6a678d5_0
  - libunistring=0.9.10=h27cfd23_0
  - libwebp-base=1.3.2=h5eee18b_0
  - llvm-openmp=14.0.6=h9e868ea_0
  - lz4-c=1.9.4=h6a678d5_1
  - markupsafe=2.1.3=py39h5eee18b_0
  - mkl=2023.1.0=h213fc3f_46344
  - mkl-service=2.4.0=py39h5eee18b_1
  - mkl_fft=1.3.8=py39h5eee18b_0
  - mkl_random=1.2.4=py39hdb19cb5_0
  - mpc=1.1.0=h10f8cd9_1
  - mpfr=4.0.2=hb69a4c5_1
  - mpmath=1.3.0=py39h06a4308_0
  - ncurses=6.4=h6a678d5_0
  - nettle=3.7.3=hbbd107a_1
  - networkx=3.2.1=py39h06a4308_0
  - openh264=2.1.1=h4ff587b_0
  - openjpeg=2.5.2=he7f1fd0_0
  - openssl=3.0.14=h5eee18b_0
  - pip=24.2=py39h06a4308_0
  - pysocks=1.7.1=py39h06a4308_0
  - python=3.9.19=h955ad1f_1
  - pytorch=2.4.0=py3.9_cuda12.4_cudnn9.1.0_0
  - pytorch-cuda=12.4=hc786d27_6
  - pytorch-mutex=1.0=cuda
  - pyyaml=6.0.1=py39h5eee18b_0
  - readline=8.2=h5eee18b_0
  - requests=2.32.3=py39h06a4308_0
  - setuptools=72.1.0=py39h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - sympy=1.12=py39h06a4308_0
  - tbb=2021.8.0=hdb19cb5_0
  - tk=8.6.14=h39e8969_0
  - torchaudio=2.4.0=py39_cu124
  - torchtriton=3.0.0=py39
  - torchvision=0.19.0=py39_cu124
  - typing_extensions=4.11.0=py39h06a4308_0
  - tzdata=2024a=h04d1e81_0
  - urllib3=2.2.2=py39h06a4308_0
  - wheel=0.43.0=py39h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - yaml=0.2.5=h7b6447c_0
  - zlib=1.2.13=h5eee18b_1
  - zstd=1.5.5=hc292b87_2
  - pip:
    - accelerate==0.23.0
    - antlr4-python3-runtime==4.9.3
    - appdirs==1.4.4
    - black==21.4b2
    - open-clip-torch==2.26.1
    - cloudpickle==3.0.0
    - cython==3.0.2
    - deepspeed==0.10.3
    - git+https://github.com/MaureenZOU/detectron2-xyz.git
    - diffdist==0.1
    - einops==0.8.0
    - ftfy==6.1.1
    - fvcore==0.1.5.post20221221
    - hjson==3.1.0
    - huggingface-hub==0.17.3
    - hydra-core==1.3.2
    - imageio==2.35.1
    - infinibatch==0.1.1
    - iopath==0.1.9
    - json-tricks==3.17.3
    - kornia==0.7.0
    - mpi4py==3.1.5
    - mup==1.0.0
    - mypy-extensions==1.0.0
    - ninja==1.11.1.1
    - nltk==3.8.1
    - numpy==1.23.1
    - omegaconf==2.3.0
    - opencv-python==4.8.1.78
    - pandas==2.0.3
    - pathspec==0.12.1
    - pillow==9.4.0
    - portalocker==2.10.1
    - py-cpuinfo==9.0.0
    - pycocotools==2.0.7
    - pydantic==1.10.18
    - pydot==3.0.1
    - regex==2023.10.3
    - scikit-image==0.21.0
    - scikit-learn==1.3.1
    - sentencepiece==0.1.99
    - tabulate==0.9.0
    - termcolor==2.4.0
    - timm==0.4.12
    - tokenizers==0.14.1
    - transformers==4.34.0
    - vision-datasets==0.2.2
    - yacs==0.1.8
example_prediction.py
ADDED
@@ -0,0 +1,47 @@
from PIL import Image
import torch
from modeling.BaseModel import BaseModel
from modeling import build_model
from utilities.distributed import init_distributed
from utilities.arguments import load_opt_from_config_files
from utilities.constants import BIOMED_CLASSES
import numpy as np

from inference_utils.inference import interactive_infer_image
from inference_utils.output_processing import check_mask_stats

opt = load_opt_from_config_files(["configs/biomedparse_inference.yaml"])
opt = init_distributed(opt)

# Load model from pretrained weights (either a local checkpoint or the HF Hub model id)
pretrained_pth = 'pretrained/biomed_parse.pt'
pretrained_pth = 'hf_hub:microsoft/BiomedParse'

model = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth).eval().cuda()
with torch.no_grad():
    model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(BIOMED_CLASSES + ["background"], is_eval=True)


# Load image and run inference
# RGB image input of shape (H, W, 3). Currently only batch size 1 is supported.
image = Image.open('examples/Part_1_516_pathology_breast.png', formats=['png'])
image = image.convert('RGB')
# Text prompts querying objects in the image. Multiple ones can be provided.
prompts = ['neoplastic cells', 'inflammatory cells']

# Load ground truth masks
gt_masks = []
for prompt in prompts:
    gt_mask = Image.open(f"examples/Part_1_516_pathology_breast_{prompt.replace(' ', '+')}.png", formats=['png'])
    gt_mask = 1*(np.array(gt_mask.convert('RGB'))[:,:,0] > 0)
    gt_masks.append(gt_mask)

pred_mask = interactive_infer_image(model, image, prompts)

# Compare predictions with the ground truth masks
for i, pred in enumerate(pred_mask):
    gt = gt_masks[i]
    dice = (1*(pred > 0.5) & gt).sum() * 2.0 / (1*(pred > 0.5).sum() + gt.sum())
    print(f'Dice score for {prompts[i]}: {dice:.4f}')
    p_value = check_mask_stats(np.array(image), pred*255, 'Pathology', prompts[i])
    print(f'p-value for {prompts[i]}: {p_value:.4f}')
examples/144DME_as_F.jpeg
ADDED
examples/C3_EndoCV2021_00462.jpg
ADDED
examples/CT_lung_nodule.dcm
ADDED
Binary file (526 kB)
examples/LIDC-IDRI-0140_143_280_CT_lung.png
ADDED