【debug】diffusion 报错 无法找到yaml配置文件
2024-01-03 04:13:44
复现问题
环境信息
使用自己数据集,从configs/latent-diffusion/inpainting_example_overfit.yaml 中修改对应地址得到configs/latent-diffusion/inpainting_catsv2.yaml。
使用指令
验证原始训练没问题
python3 main_inpainting.py --train --name custom_training --base configs/latent-diffusion/inpainting_example_overfit.yaml --gpus 1, --seed 42
新指令报错
python3 main_inpainting.py --train --name tst_infrared --base configs/latent-diffusion/inpainting_catsv2.yaml --gpus 0,1 --seed 42
报错信息
Traceback (most recent call last):
File "main_inpainting.py", line 707, in <module>
trainer.fit(model, data)
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
callback.on_pretrain_routine_start(self, self.lightning_module)
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 282, in on_pretrain_routine_start
OmegaConf.save(self.config,
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/omegaconf/omegaconf.py", line 216, in save
with io.open(os.path.abspath(f), "w", encoding="utf-8") as file:
FileNotFoundError: [Errno 2] No such file or directory: '/home/spai/code/Stable-Diffusion-Inpaint-main/logs/2023-02-08_tst_infrared/configs/2023-02-08-project.yaml'
None
定位到新建项目时缺失文件
main_inpainting.py
# Command-line options for the training script.
# Options declared with nargs="?" and const=True may be passed without a
# value; argparse then stores the const (True) instead of the default.

# --- run identification / resuming ---
parser.add_argument(
    "-n",
    "--name",
    type=str,
    nargs="?",
    const=True,
    default="",
    help="postfix for logdir",
)
parser.add_argument(
    "-r",
    "--resume",
    type=str,
    nargs="?",
    const=True,
    default="",
    help="resume from logdir or checkpoint in logdir",
)

# --- configuration files ---
parser.add_argument(
    "-b",
    "--base",
    nargs="*",
    default=[],
    metavar="base_config.yaml",
    help="paths to base configs. Loaded from left-to-right. "
    "Parameters can be overwritten or added with command-line options of the form `--key value`.",
)

# --- run mode switches ---
parser.add_argument(
    "-t",
    "--train",
    type=str2bool,
    nargs="?",
    const=True,
    default=False,
    help="train",
)
parser.add_argument(
    "--no-test",
    type=str2bool,
    nargs="?",
    const=True,
    default=False,
    help="disable test",
)
parser.add_argument(
    "-p",
    "--project",
    help="name of new or path to existing project",
)
parser.add_argument(
    "-d",
    "--debug",
    type=str2bool,
    nargs="?",
    const=True,
    default=False,
    help="enable post-mortem debugging",
)

# --- reproducibility, naming and logging ---
parser.add_argument(
    "-s",
    "--seed",
    type=int,
    default=23,
    help="seed for seed_everything",
)
parser.add_argument(
    "-f",
    "--postfix",
    type=str,
    default="",
    help="post-postfix for default name",
)
parser.add_argument(
    "-l",
    "--logdir",
    type=str,
    default="logs",
    help="directory for logging dat shit",
)

# --- optimisation ---
parser.add_argument(
    "--scale_lr",
    type=str2bool,
    nargs="?",
    const=True,
    default=False,
    help="scale base-lr by ngpu * batch_size * n_accumulate",
)
--name:用于指定日志目录的后缀
尝试 解决
已有的文件复制进去
Traceback (most recent call last):
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 711, in <module>
trainer.fit(model, data)
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 298, in on_pretrain_routine_start
os.rename(self.logdir, dst)
OSError: [Errno 39] Directory not empty: 'logs/2024-01-02_tst_infrared' -> 'logs/child_runs/2024-01-02_tst_infrared'
None
寻找生成脚本问题
# Create logdirs and save configs
os.makedirs(self.logdir, exist_ok=True)
os.makedirs(self.ckptdir, exist_ok=True)
os.makedirs(self.cfgdir, exist_ok=True)
if "callbacks" in self.lightning_config:
if 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']:
os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True)
# print("Project config")
# print(OmegaConf.to_yaml(self.config))
OmegaConf.save(self.config,
os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
print("Lightning config")
print(OmegaConf.to_yaml(self.lightning_config))
OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now)))
使用打印后解决???
if DEBUG:
print("Project config")
print(OmegaConf.to_yaml(self.config))
print('')
print('debug: save cfg ...',os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
新问题定位
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 286, in on_pretrain_routine_start
OmegaConf.save(self.config,
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/omegaconf/omegaconf.py", line 216, in save
with io.open(os.path.abspath(f), "w", encoding="utf-8") as file:
FileNotFoundError: [Errno 2] No such file or directory: '/home/spai/code/Stable-Diffusion-Inpaint-main/logs/2024-01-02_tst_infrared_6/configs/2024-01-02-project.yaml'
None
if DEBUG:
print("Project config")
print(OmegaConf.to_yaml(self.config))
print('')
print('debug: save cfg ...',os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
# todo bug位置
OmegaConf.save(self.config,
os.path.join(self.cfgdir, "{}-project.yaml".format(self.now)))
@staticmethod
def save(
    config: Any, f: Union[str, pathlib.Path, IO[Any]], resolve: bool = False
) -> None:
    """
    Serialize a configuration object to YAML and write it to a file.

    :param config: omegaconf.Config object (DictConfig or ListConfig);
        dataclass / attrs instances are converted with OmegaConf.create first.
    :param f: filename (str or pathlib.Path) or an object with a write method
    :param resolve: True to save a resolved config (defaults to False)
    :raises TypeError: if f is neither a path nor a writable object
    """
    needs_wrapping = is_dataclass(config) or is_attr_class(config)
    if needs_wrapping:
        config = OmegaConf.create(config)
    data = OmegaConf.to_yaml(config, resolve=resolve)

    if isinstance(f, (str, pathlib.Path)):
        # The file is opened for writing as-is: the parent directory is not
        # created here, so it must already exist.
        with io.open(os.path.abspath(f), "w", encoding="utf-8") as file:
            file.write(data)
        return

    if not hasattr(f, "write"):
        raise TypeError("Unexpected file type")

    # File-like target: write and flush, but leave closing to the caller.
    f.write(data)
    f.flush()
其他问题
FileNotFoundError: [Errno 2] No such file or directory: 'logs/2024-01-02_tst_infrared_3/configs'
存放位置问题? 未更改脚本时候
第二次运行相同name 报错
debug: setup files logs/2024-01-02_tst_infrared_6 logs/2024-01-02_tst_infrared_6/checkpoints logs/2024-01-02_tst_infrared_6/configs
Project config
Traceback (most recent call last):
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 717, in <module>
trainer.fit(model, data)
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
self._run(model)
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
self._dispatch()
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
self.accelerator.start_training(self)
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
self._results = trainer.run_stage()
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 996, in run_stage
return self._run_train()
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1026, in _run_train
self._pre_training_routine()
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1013, in _pre_training_routine
self.on_pretrain_routine_start()
File "/home/spai/anaconda3/envs/sdi/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py", line 164, in on_pretrain_routine_start
callback.on_pretrain_routine_start(self, self.lightning_module)
File "/home/spai/code/Stable-Diffusion-Inpaint-main/main_inpainting.py", line 304, in on_pretrain_routine_start
os.rename(self.logdir, dst)
OSError: [Errno 39] Directory not empty: 'logs/2024-01-02_tst_infrared_6' -> 'logs/child_runs/2024-01-02_tst_infrared_6'
None
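主要错误原因:从下面的代码看,使用多卡(--gpus 0,1)训练时,rank 0 负责创建日志目录并保存配置,而其余 rank 会走 else 分支,把同一个 logdir 移动到 child_runs 下。两者并发执行,于是 rank 0 保存配置时目录可能已被移走(FileNotFoundError);而当 child_runs 下的目标目录已存在且非空时,os.rename 会失败(OSError: Directory not empty)。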
def on_pretrain_routine_start(self, trainer, pl_module):
    """Prepare the run's log directory before training starts.

    On global rank 0 the log, checkpoint and config directories are created
    and the project and Lightning configs are written into ``self.cfgdir``.
    On every other rank an existing ``self.logdir`` is moved to
    ``<parent>/child_runs/<name>`` unless the run is being resumed.

    Args:
        trainer: Object whose ``global_rank`` selects which branch runs.
        pl_module: Not used by this hook.

    Raises:
        OSError: If moving the log directory fails for a reason other than
            the source being gone or the destination already existing.
    """
    import errno  # local import: only the rename error handling needs it

    if trainer.global_rank == 0:
        # Create logdirs and save configs
        os.makedirs(self.logdir, exist_ok=True)
        os.makedirs(self.ckptdir, exist_ok=True)
        os.makedirs(self.cfgdir, exist_ok=True)
        if DEBUG:
            print('')
            print("debug: setup files", self.logdir, self.ckptdir, self.cfgdir)

        if ("callbacks" in self.lightning_config
                and 'metrics_over_trainsteps_checkpoint' in self.lightning_config['callbacks']):
            os.makedirs(os.path.join(self.ckptdir, 'trainstep_checkpoints'), exist_ok=True)

        project_cfg_path = os.path.join(self.cfgdir, "{}-project.yaml".format(self.now))
        lightning_cfg_path = os.path.join(self.cfgdir, "{}-lightning.yaml".format(self.now))

        if DEBUG:
            print("Project config")
            print('')
            print('debug: save cfg ...', project_cfg_path)

        # The non-zero-rank branch below may move self.logdir away while this
        # rank is still working in it. Re-creating the config directory right
        # before each write narrows (but cannot fully close) that window,
        # which otherwise surfaces as FileNotFoundError inside OmegaConf.save.
        os.makedirs(self.cfgdir, exist_ok=True)
        OmegaConf.save(self.config, project_cfg_path)

        print("Lightning config")
        print(OmegaConf.to_yaml(self.lightning_config))
        os.makedirs(self.cfgdir, exist_ok=True)
        OmegaConf.save(OmegaConf.create({"lightning": self.lightning_config}),
                       lightning_cfg_path)
        return

    # ModelCheckpoint callback created log directory --- remove it
    if self.resume or not os.path.exists(self.logdir):
        return

    parent, name = os.path.split(self.logdir)
    child_runs_dir = os.path.join(parent, "child_runs")
    os.makedirs(child_runs_dir, exist_ok=True)
    dst = os.path.join(child_runs_dir, name)
    try:
        os.rename(self.logdir, dst)
    except FileNotFoundError:
        # The directory disappeared between the exists() check and the move.
        pass
    except OSError as err:
        # A previous run with the same name already left a non-empty
        # directory at dst; os.rename refuses to overwrite it. Leave the
        # log directory where it is instead of aborting training.
        if err.errno not in (errno.ENOTEMPTY, errno.EEXIST):
            raise
        print("Could not move {} to {}: destination already exists".format(self.logdir, dst))
文章来源:https://blog.csdn.net/prinTao/article/details/135342271
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!