Preview

{
    "@context": {
        "@language": "en",
        "@vocab": "https://schema.org/",
        "arrayShape": "cr:arrayShape",
        "citeAs": "cr:citeAs",
        "column": "cr:column",
        "conformsTo": "dct:conformsTo",
        "cr": "http://mlcommons.org/croissant/",
        "data": {
            "@id": "cr:data",
            "@type": "@json"
        },
        "dataBiases": "cr:dataBiases",
        "dataCollection": "cr:dataCollection",
        "dataType": {
            "@id": "cr:dataType",
            "@type": "@vocab"
        },
        "dct": "http://purl.org/dc/terms/",
        "extract": "cr:extract",
        "field": "cr:field",
        "fileProperty": "cr:fileProperty",
        "fileObject": "cr:fileObject",
        "fileSet": "cr:fileSet",
        "format": "cr:format",
        "includes": "cr:includes",
        "isArray": "cr:isArray",
        "isLiveDataset": "cr:isLiveDataset",
        "jsonPath": "cr:jsonPath",
        "key": "cr:key",
        "md5": "cr:md5",
        "parentField": "cr:parentField",
        "path": "cr:path",
        "personalSensitiveInformation": "cr:personalSensitiveInformation",
        "rai": "http://mlcommons.org/croissant/RAI/",
        "recordSet": "cr:recordSet",
        "references": "cr:references",
        "regex": "cr:regex",
        "repeated": "cr:repeated",
        "replace": "cr:replace",
        "sc": "https://schema.org/",
        "separator": "cr:separator",
        "source": "cr:source",
        "subField": "cr:subField",
        "transform": "cr:transform",
        "containedIn": "cr:containedIn"
    },
    "@type": "sc:Dataset",
    "name": "chili100k_strat: Dataset to train or fine-tune CrystaLLM-pi for the targeted generation of experimental materials conditioned on XRD profiles",
    "description": "The dataset contains experimentally determined crystal structures sourced from [Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://github.com/UlrikFriisJensen/CHILI) as described in [CHILI: Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://dx.doi.org/10.1145/3637528.3671538), a curated and filtered subset of the Crystallography Open Database [COD](https://www.crystallography.net/cod/). The structural data underwent text augmentation as per the pre-processing pipeline in [CrystaLLM-pi](https://github.com/C-Bone-UCL/CrystaLLM-pi). Each structure was labelled with its theoretical X-ray diffraction (XRD) pattern. The complete dataset, published on Hugging Face (https://huggingface.co/datasets/c-bone/chili100k_strat), can be used to train or fine-tune CrystaLLM-pi for the targeted generation of materials conditioned on XRD profiles.",
    "alternateName": [
        "c-bone/chili100k_strat"
    ],
    "conformsTo": "http://mlcommons.org/croissant/1.1",
    "citeAs": "Cyprien Bone. Dataset. 2026. chili100k_strat. [accessed YYYY-MM-DD].",
    "creator": {
        "@type": "Person",
        "name": "Cyprien Bone",
        "url": "https://huggingface.co/c-bone"
    },
    "keywords": [
        "10K - 100K",
        "parquet",
        "Text",
        "Datasets",
        "pandas",
        "Croissant",
        "Polars",
        "US Region: US",
        "CrystaLLM-pi",
        "Crystallography Open Database (COD)",
        "CIF",
        "condition vector",
        "crystal"
    ],
    "datePublished": "2026-04-24T00:00:00Z",
    "url": "https://huggingface.co/datasets/c-bone/chili100k_strat",
    "version": "1.0.0",
    "rai:dataCollection": "See rai:dataCollectionRawData for the origin of the raw data and rai:dataPreprocessingProtocol for how it was curated. The structural data then underwent text augmentation as per the pre-processing pipeline in CrystaLLM-pi (https://github.com/C-Bone-UCL/CrystaLLM-pi). Each structure was labelled with its theoretical X-ray diffraction (XRD) pattern.",
    "rai:dataCollectionType": [
        "Synthetic",
        "Experimental"
    ],
    "rai:dataCollectionMissingData": "Not applicable",
    "rai:dataCollectionRawData": "Data originally from Crystallography Open Database (COD).",
    "rai:dataAnnotationProtocol": "Not applicable",
    "rai:dataAnnotationPlatform": "Not applicable",
    "rai:dataAnnotationAnalysis": "Not applicable",
    "rai:annotatorDemographics": "Not applicable",
    "rai:machineAnnotationTools": "Not applicable",
    "rai:annotationsPerItem": "Not applicable",
    "rai:dataPreprocessingProtocol": [
        "The dataset contains experimentally determined crystal structures sourced from [Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://github.com/UlrikFriisJensen/CHILI) as described in [CHILI: Chemically-Informed Large-scale Inorganic Nanomaterials Dataset for Advancing Graph Machine Learning](https://dx.doi.org/10.1145/3637528.3671538), a curated and filtered subset of the Crystallography Open Database [COD](https://www.crystallography.net/cod/)."
    ],
    "rai:dataManipulationProtocol": "The dataset was constructed so that the test set contains: 500 materials whose structures were seen in training (is_novel == False); 500 materials whose atomic composition (reduced formula) was seen in training but whose structure was never seen (i.e. a polymorph, flagged is_novel == True and is_comp_novel == False); and 500 materials whose atomic composition was never seen in any training phase (is_comp_novel == True).",
    "rai:dataImputationProtocol": "Not applicable",
    "rai:dataUseCases": [
        "Dataset to train or fine-tune CrystaLLM-pi (https://github.com/C-Bone-UCL/CrystaLLM-pi)"
    ],
    "rai:dataBiases": [
        "Not applicable"
    ],
    "rai:personalSensitiveInformation": [
        "No personal or sensitive information is included in the data."
    ],
    "rai:dataSocialImpact": "Not applicable",
    "rai:dataLimitations": [
        "This dataset contains ~14K materials; note that CrystaLLM model performance is best when training on datasets with more than ~40K materials."
    ],
    "rai:dataReleaseMaintenancePlan": "The data are being released as a one-off with no immediate plans for revisions.",
    "distribution": [
        {
            "@type": "cr:FileObject",
            "@id": "train-00000-of-00001_fileobject",
            "name": "train-00000-of-00001.parquet",
            "description": "Training subset of dataset (11091 records)",
            "contentSize": "618485 B",
            "contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/train-00000-of-00001.parquet?download=true",
            "encodingFormat": "application/x-parquet",
            "sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
        },
        {
            "@type": "cr:FileObject",
            "@id": "test-00000-of-00001_fileobject",
            "name": "test-00000-of-00001.parquet",
            "description": "Test subset of dataset (1500 records)",
            "contentSize": "107786 B",
            "contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/test-00000-of-00001.parquet?download=true",
            "encodingFormat": "application/x-parquet",
            "sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
        },
        {
            "@type": "cr:FileObject",
            "@id": "validation-00000-of-00001_fileobject",
            "name": "validation-00000-of-00001.parquet",
            "description": "Validation subset of dataset (1500 records)",
            "contentSize": "107590 B",
            "contentUrl": "https://huggingface.co/datasets/c-bone/chili100k_strat/resolve/main/data/validation-00000-of-00001.parquet?download=true",
            "encodingFormat": "application/x-parquet",
            "sha256": "40bfbaa00fde164b8b6ff511d3a4349a9ea09225bd3bf222ee8d815cac7c12af"
        },
        {
            "@type": "cr:FileSet",
            "@id": "parquet-files-for-config-default",
            "description": "Croissant FileSet comprising the training, test and validation splits as separate parquet files. Each file contains data exported from the Crystallography Open Database (COD) which can be used to pre-train CrystaLLM-pi. All files share the same format and contain the crystal material ID, reduced formula, contents of the CIF file, and condition vector.",
            "cr:FileObject": [
                {
                    "@id": "instances_annotations_val_fileobject"
                }
            ],
            "encodingFormat": "application/x-parquet",
            "includes": "*.parquet"
        }
    ],
    "recordSet": [
        {
            "@type": "cr:RecordSet",
            "dataType": "cr:Split",
            "key": {
                "@id": "default_splits/split_name"
            },
            "@id": "default_splits",
            "name": "default_splits",
            "description": "Splits for the default config.",
            "field": [
                {
                    "@type": "cr:Field",
                    "@id": "default_splits/split_name",
                    "dataType": "sc:Text"
                }
            ],
            "data": [
                {
                    "default_splits/split_name": "train"
                },
                {
                    "default_splits/split_name": "validation"
                },
                {
                    "default_splits/split_name": "test"
                }
            ]
        },
        {
            "@type": "cr:RecordSet",
            "@id": "default",
            "description": "c-bone/chili100k_strat - 'default' subset\n\nAdditional information:\n- 3 splits: train, validation, test",
            "field": [
                {
                    "@type": "cr:Field",
                    "@id": "default/split",
                    "name": "splits",
                    "description": "Split of the dataset that the record belongs to; one of the enumerated values train, validation or test.",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "fileProperty": "fullpath"
                        },
                        "transform": {
                            "regex": "(?:^|[\\\\/])data[\\\\/](train|validation|test).+\\.parquet$"
                        }
                    },
                    "references": {
                        "field": {
                            "@id": "default_splits/split_name"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/database",
                    "name": "Database",
                    "description": "Source database that this crystal is from. This whole dataset is extracted from the database COD (Crystallography Open Database).",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "Database"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/material_id",
                    "name": "Material ID",
                    "description": "ID of crystal material in source database.",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "Material ID"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/reduced_formula",
                    "name": "Reduced Formula",
                    "description": "Reduced formula of crystal.",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "Reduced Formula"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/cif",
                    "name": "CIF",
                    "description": "Contents of crystal's CIF (Crystallographic Information File, as defined in https://www.iucr.org/resources/cif/spec/version1.1) as text.",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "CIF"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/condition_vector",
                    "name": "Condition Vector",
                    "description": "High-dimensional numerical representation of the 20 most intense XRD (X-ray diffraction) peaks of the crystal, used to identify or reconstruct a crystal structure. It consists of a combined set of 40 values: the 20 peak positions (2-theta angles) and their 20 corresponding peak intensities. Peak positions are min-max normalised over the range 0 to 90 and intensities are min-max normalised over the range 0 to 100.",
                    "dataType": "sc:Text",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "condition_vector"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/is_novel",
                    "name": "Is Novel?",
                    "description": "Indicates whether the material's structure is novel, i.e. was never seen in training (is_novel == False means the structure was seen in training).",
                    "dataType": "sc:Boolean",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "is_novel"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/is_comp_novel",
                    "name": "Is Composition Novel?",
                    "description": "Indicates whether the material's atomic composition (reduced formula) was never seen in any training phase. A material with is_novel == True but is_comp_novel == False is a polymorph: its composition was seen in training but its structure was never seen.",
                    "dataType": "sc:Boolean",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "is_comp_novel"
                        }
                    }
                },
                {
                    "@type": "cr:Field",
                    "@id": "default/token_count",
                    "name": "Token Count",
                    "description": "Token count",
                    "dataType": "sc:Integer",
                    "source": {
                        "fileSet": {
                            "@id": "parquet-files-for-config-default"
                        },
                        "extract": {
                            "column": "token_count"
                        }
                    }
                }
            ]
        }
    ]
}