{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"datePublished": "sc:datePublished",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"examples": {
"@id": "cr:examples",
"@type": "@json"
},
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform",
"wd": "https://www.wikidata.org/wiki/"
},
"@type": "sc:Dataset",
"name": "BenchmarkSet1500 aggregated AI Ready Dataset",
"description": "High-Accuracy Excited-State Reference Benchmark Dataset for Organic Semiconductors. The BenchmarkSet1500 resource theme provides a dataset of multireference excited states for 1500 small organic semiconductors, alongside a Python-based workflow used to generate the associated high-level excited-state calculations. It is designed for researchers in organic electronics and data-driven chemistry who require reliable and reproducible excited-state data, as well as those developing machine learning models or screening pipelines. By combining standardised computational workflows with multi-level electronic structure methods (TD-DFT, CASSCF, NEVPT2), the resource enables reproducible data generation and delivers an AI-ready dataset suitable for structure-property analysis, direct quantum chemistry method comparison, and molecular design.",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"citeAs": "@Article{furtherInformation1,title = 'Multireference Excited-State Screening Reveals Hidden Candidate Space in Organic Semiconductors', url='https://doi.org/DOIOFYOURPAPER'}",
"creator": [
{
"@type": "sc:Person",
"name": "Tahereh Nematiaram",
"url": "https://orcid.org/0000-0002-0371-4047"
},
{
"@type": "sc:Person",
"name": "Malin Zollner",
"url": "https://orcid.org/0009-0000-9662-0869"
}
],
"datePublished": "2026-04-17T00:00:00Z",
"license": "https://spdx.org/licenses/CC-BY-4.0.html",
"url": "https://data-collections.psdi.ac.uk/records/mktrj-smy12/latest",
"version": "1.0.0",
"rai:dataCollection": "The initial dataset was derived from the approximately 40,000 organic semiconductor molecules reported by Omar et al. (2022), which were originally curated from the Cambridge Structural Database (CSD). This parent set was assembled prior to this work and restricted to well-defined molecular crystals composed of elements commonly found in organic semiconductors, with polymeric systems, disordered solids, and co-crystals excluded. From this space, a subset was selected to enrich for electronically challenging systems relevant to excited-state screening, using TD-DFT-derived criteria targeting small singlet\u2013triplet gaps (S1-T1 < 0.275 eV) and signatures of double-excitation character (S2-S1 < 0.250 eV and f2-5f1 > 0.350). An additional random sample of around 200 molecules was included to preserve chemical diversity, yielding around 1,500 molecules in total.",
"rai:dataCollectionType": [
"Calculations"
],
"rai:dataCollectionMissingData": "Not applicable",
"rai:dataCollectionRawData": "Initial raw data were geometry files obtained from the CCDC.",
"rai:dataAnnotationProtocol": "This data source was not annotated as such.",
"rai:dataAnnotationPlatform": "Not applicable",
"rai:dataAnnotationAnalysis": "Not applicable",
"rai:annotatorDemographics": "Not applicable",
"rai:machineAnnotationTools": "Not applicable",
"rai:annotationsPerItem": "Not applicable",
"rai:dataPreprocessingProtocol": [
"Not applicable"
],
"rai:dataManipulationProtocol": "The resource integrates standardised, fully automated workflows with multi-level electronic structure methods (CASSCF, NEVPT2) to generate reproducible excited-state data. It covers input generation for ground state optimisation using Gaussian 16, HPC job submission, error correction, input generation for excited state calculations using ORCA, and structured data extraction. The full workflow code is available at https://github.com/OrganicAI-Lab/PSDI_Benchmark_Set_1500.",
"rai:dataImputationProtocol": "Not applicable",
"rai:dataUseCases": [
"Designed for researchers in organic electronics and data-driven chemistry who require reliable and reproducible excited-state data, as well as those developing machine learning models or screening pipelines.",
"Structure-property analysis",
"Direct quantum chemistry method comparison",
"Molecular design"
],
"rai:dataBiases": [
"Not applicable"
],
"rai:personalSensitiveInformation": [
"No personal or sensitive information is included in the data."
],
"rai:dataSocialImpact": "Not applicable",
"rai:dataLimitations": [
"Not applicable"
],
"rai:dataReleaseMaintenancePlan": "The data are being released as a one-off release, with no immediate plans for revisions.",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "main_file",
"name": "BenchmarkSet1500.csv",
"description": "Multireference excited-state data file which contains aggregated data for organic semiconductors. Each row corresponds to a single molecule. Columns include molecular identifiers (CCDC ID, SMILES, InChI, formula, number of atoms, CCDC URL, DOI) and SA-CASSCF and NEVPT2 computed excited-state energies (S1, S2, T1, T2) and oscillator strengths (f1, f2).",
"contentSize": "103823 B",
"contentUrl": "./BenchmarkSet1500.csv",
"encodingFormat": "text/csv",
"sha256": "0b449f2e8f7d9fd46749205166dbde02f21b7fb52aa959b6007a72f27e1dec93"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"@id": "main_file_recordset",
"name": "benchmarkset1500_datafile_recordset",
"description": "Multireference excited-state data file which contains aggregated data for organic semiconductors.",
"key": {
"@id": "main_file_recordset/ID"
},
"field": [
{
"@type": "cr:Field",
"@id": "main_file_recordset/ID",
"name": "ID",
"description": "CCDC Molecule ID.",
"dataType": [
"sc:Text"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "ID"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SMILES",
"name": "SMILES",
"description": "Canonical SMILES representation of the molecule.",
"dataType": [
"sc:Text"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SMILES"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/InChI",
"name": "InChI",
"description": "InChI representation of the molecule.",
"dataType": [
"sc:Text"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "InChI"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/formula",
"name": "formula",
"description": "Molecular formula of the molecule.",
"dataType": [
"sc:Text"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "formula"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/#atoms",
"name": "Number of atoms",
"description": "Number of atoms of the molecule.",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "#atoms"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/ccdc_url",
"name": "ccdc url",
"description": "Link to CCDC entry.",
"dataType": [
"sc:URL"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "ccdc_url"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/doi",
"name": "doi",
"description": "DOI of related publication.",
"dataType": [
"sc:Text"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "doi"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_E(S1)",
"name": "SA-CASSCF E(S1)",
"description": "First singlet excited-state energy calculated using SA-CASSCF (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_E(S1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_E(S1)",
"name": "NEVPT2 E(S1)",
"description": "First singlet excited-state energy calculated using NEVPT2 (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_E(S1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_E(S2)",
"name": "SA-CASSCF E(S2)",
"description": "Second singlet excited-state energy calculated using SA-CASSCF (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_E(S2)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_E(S2)",
"name": "NEVPT2 E(S2)",
"description": "Second singlet excited-state energy calculated using NEVPT2 (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_E(S2)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_E(T1)",
"name": "SA-CASSCF E(T1)",
"description": "First triplet excited-state energy calculated using SA-CASSCF (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_E(T1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_E(T1)",
"name": "NEVPT2 E(T1)",
"description": "First triplet excited-state energy calculated using NEVPT2 (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_E(T1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_E(T2)",
"name": "SA-CASSCF E(T2)",
"description": "Second triplet excited-state energy calculated using SA-CASSCF (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_E(T2)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_E(T2)",
"name": "NEVPT2 E(T2)",
"description": "Second triplet excited-state energy calculated using NEVPT2 (eV).",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_E(T2)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_f(S1)",
"name": "SA-CASSCF f(S1)",
"description": "S1 oscillator strength calculated using SA-CASSCF.",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_f(S1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_f(S1)",
"name": "NEVPT2 f(S1)",
"description": "S1 oscillator strength calculated using NEVPT2.",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_f(S1)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/SA_CASSCF_f(S2)",
"name": "SA-CASSCF f(S2)",
"description": "S2 oscillator strength calculated using SA-CASSCF.",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "SA_CASSCF_f(S2)"
}
}
},
{
"@type": "cr:Field",
"@id": "main_file_recordset/NEVPT2_f(S2)",
"name": "NEVPT2 f(S2)",
"description": "S2 oscillator strength calculated using NEVPT2.",
"dataType": [
"sc:Float"
],
"source": {
"fileObject": {
"@id": "main_file"
},
"extract": {
"column": "NEVPT2_f(S2)"
}
}
}
]
}
]
}